def tokenize_sentences(domain_concept, text):
    my_stop_words = ['say', '\'s', 'be', 'says', 'including', 'said', 'named', '\t', 'know', '\n\n', 'Des', ' ', '']
    for stop_word in my_stop_words:
        lexeme = nlp.vocab[stop_word]
        lexeme.is_stop = True

    sentences = []

    if not isinstance(text, float):
        text_sentences = textcleaner.split_sentences(text)
        cleaned_sentences = []

        for sent in text_sentences:
            cleaned_sentences.append(nlp(sent.lower()))

        for sentence in cleaned_sentences:
            sent = []
            for w in sentence:
                if w.text != '\n' and not w.is_stop and not w.is_punct and not w.like_num and len(w.text) > 1:
                    sent.append(w.text.strip())
            sentences.append(sent)
    else:
        logging.warning(domain_concept + ": does not have summary")
        pass

    return sentences
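Note: the snippet above relies on names created elsewhere in its project: a spaCy pipeline called nlp, gensim's textcleaner, and the standard logging module. A minimal sketch of that setup, assuming gensim 3.x (the summarization module was removed in gensim 4.0) and any English spaCy model, might look like this:

import logging

import spacy
from gensim.summarization import textcleaner  # gensim 3.x only

# Assumption: any English spaCy pipeline works here; the original project may use a different model.
nlp = spacy.load("en_core_web_sm")

tokens = tokenize_sentences(
    "Photosynthesis",
    "Photosynthesis is a process used by plants. It converts light energy into chemical energy.",
)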
Example No. 2
def edit_article(article):
    """ Gets an article db record reference edits it and saves it.
    """
    update_log.info('Editing {}'.format(article.original_title))
    summary = summarize(article.original_text)

    if article.original_language != EN:
        translate_langs = "{}-{}".format(article.original_language, EN)
        try:
            title = translate_this(article.original_title, translate_langs)
            summary = translate_this(summary, translate_langs)
        except Exception as err:
            update_log.error(err)
            return None
    else:
        title = article.original_title

    if summary is not None:
        article.title = title
        html_summary = ""
        for sent in textcleaner.split_sentences(summary):
            html_summary += "<p>{}</p>".format(sent)
        article.summary = html_summary
        article.keywords = gn_keywords(summary).replace("\n", ", ")
        article.status = READY
        article.save()
        update_log.info('Editing finished successfully!')
    else:
        update_log.error('Could not finish editing the article.')

    return article
Example No. 3
def summarize(text, word_count=256):
    """
    gensim summarizer 이용
    https://github.com/anmolgulati/gensim/blob/df238ef1bc71568819ba92502f0e9df46b933698/gensim/summarization/summarizer.py
    corpus로 만든 후 word가 3개 이하면 워닝 발생, sentencerk 10개 이하면 warning 발생시킴
    """
    # Check if the text is too short.
    MIN_WORD_LENGTH = word_count
    MIN_SENTENCE_LENGTH = 2

    word_num = len(set(text.split()))
    if word_num < MIN_WORD_LENGTH:
        return text
    else:
        sentence_num = len(split_sentences(text))
        if sentence_num < MIN_SENTENCE_LENGTH:
            # logger.debug('too short text')
            # print('too short text')
            return text

    text_summarized = textrank_summarizer(text, word_count=word_count)
    text_summarized = re.sub('\n', ' ', text_summarized)
    if len(text_summarized) == 0:
        return text

    return text_summarized
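The helpers used above are not imported in the snippet; they appear to be aliases for gensim 3.x functions. A hedged guess at the imports, plus a short call that exercises the early-return guard:

import re

from gensim.summarization.summarizer import summarize as textrank_summarizer  # assumed alias
from gensim.summarization.textcleaner import split_sentences

sample = "Gensim builds a graph of sentences. TextRank scores each sentence. The top sentences form the summary."
print(summarize(sample, word_count=256))  # too few unique words, so the guard returns the text unchanged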
Example No. 4
def summary(x, perc):  # x: input document; perc: fraction (ratio) of the original document to keep
    if len(split_sentences(x)) > 10:
        test_summary = summarize(x, ratio=perc, split=True)
        test_summary = '\n'.join(map(str, f(test_summary)))
    else:
        test_summary = x
    return test_summary
Example No. 5
def create_paragraphs(article_text, paragraph_size):
    sentence_list = split_sentences(article_text)
    sentences_groups_list = group_sentences(sentence_list, paragraph_size)
    new_article = ""
    for group in sentences_groups_list:
        paragraph = " ".join(group)
        new_article = f"{new_article}{paragraph}<BR/><BR/>"

    return new_article
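group_sentences is not a gensim function and is not shown in this example. A plausible, purely hypothetical implementation would simply chunk the sentence list into consecutive groups:

from typing import List


def group_sentences(sentence_list: List[str], group_size: int) -> List[List[str]]:
    # Hypothetical helper: split the sentences into consecutive chunks of
    # `group_size`; the original project's version may behave differently.
    return [
        sentence_list[i:i + group_size]
        for i in range(0, len(sentence_list), group_size)
    ]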
Example No. 6
def readCorpus(fname, tokens_only=False, mode='w'):
    tokens = []
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            if mode == 's':
                tokens.append(split_sentences(remove_stopwords(line)))
            else:  # Train text with or without tags
                tokens.append(gensim.utils.simple_preprocess(remove_stopwords(line)))
    return tokens
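Example No. 7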
def clean_data_to_format(directory, partition, part):
    print('Begin reading of data')
    _, texts = select_partition(directory, partition, part)
    print('Begin preprocessing of data')
    output_doc = ''
    output_sum = ''
    for text in texts:
        document, summary = split_doc(text)
        original_document = split_sentences(document)
        original_summary = split_sentences(summary)
        original_document = ' ###SENT### '.join(original_document)
        original_summary = ' ###SENT### '.join(original_summary)
        output_doc = output_doc + '\"' + original_document + '\" \n'
        output_sum = output_sum + '\"' + original_summary + '\" \n'

    print('Saving data')
    save_texts(directory, part + '.src', [output_doc], [''])
    save_texts(directory, part + '.tgt', [output_sum], [''])
    print('Saved data')
Example No. 8
 def summarize(self, msgs, range_spec=None):
     """Return a summary of the text
     TODO: 1. Looks like spacy is not getting the main sentence from the message.
     2. Load times for the spacy summarizer won't cut it. Commenting out now 
        until this can be fixed
     """
     if not msgs or len(msgs) == 0:
         self.logger.warn("No messages to form summary")
         return u"\n Unable to form summary here.\n"
     txt = range_spec['txt'] if range_spec else u'Summary is'
     size = range_spec['size'] if range_spec and 'size' in range_spec else 3
     summ = txt + u' '
     # limit the canonical dictionary to the 300 longest messages
     can_dict = {canonicalize(get_msg_text(msg)) : msg for msg in msgs}
     top_keys = sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:300]
     can_dict = {key: can_dict[key] for key in top_keys}
     self.logger.info("Length of can_dict is %s", len(can_dict))
     simple_sum = u'\n'.join([self.tagged_sum(can_dict[ss]) for ss in sorted(can_dict.keys(), key=lambda x: len(x.split()), reverse=True)[:3]])
     # If the number of messages or vocabulary is too low, just look for a
     # promising set of messages
     if len(msgs) < 11 or len(can_dict) < 11:
         #return the longest
         self.logger.warn("Too few messages for NLP.")
         summ += simple_sum
     else:
         max_sents = {}
         for (txt, msg) in can_dict.items():
             if len(txt.split()) > 3:
                 #Use the same splitting that gensim does
                 for snt in split_sentences(txt):
                     if len(snt.split()) > 100:
                         snt = u' '.join(snt.split()[:100])
                     max_sents[snt] = msg
         ratio = (size * 2)/ float(len(max_sents.keys()))
         #ratio = 0.3
         sent1 = u' '.join(can_dict.keys())
         sent2 = u' '.join(max_sents.keys())
         gn_sum = gs_sumrz(sent1, ratio=ratio, split=True)[:size]
         mx_sum = gs_sumrz(sent2, ratio=ratio, split=True)[:size]
         self.logger.info("Gensim sum %s", gn_sum)
         gs_summ = u'\n'.join([self.tagged_sum(can_dict[ss] if ss in can_dict else max_sents[ss]) for ss in gn_sum if len(ss) > 1 and (ss in max_sents or ss in can_dict)])
         for ss in mx_sum:
             if ss not in max_sents and ss not in can_dict and len(ss.split()) > 5:
                 self.logger.info("Searching for: %s", ss)
                 for (ky, msg) in max_sents.items():
                     if ss in ky or (len(ky.split()) > 10 and ky in ss):
                         gs_summ += u'\n' + self.tagged_sum(msg)
         if len(gn_sum) > 1:
             summ += gs_summ
         else:
             self.logger.warn("NLP Summarizer produced null output %s", gs_summ)
             summ += simple_sum
     self.logger.info("Summary for segment %s is %s", msgs, summ) 
     return summ
Example No. 9
File: ex9.py  Project: mat-hek/pjn
def mk_bigrams():
    with open(dump_base + "judgments", 'r', encoding="utf-8") as f:
        judgments = f.read()

    sentences = [list(gensim.utils.simple_tokenize(s)) for s in textcleaner.split_sentences(judgments)]

    bigramer = Phraser(Phrases(sentences))

    bigramer.save(dump_base + "bigramer")

    return [bigramer[s] for s in sentences]
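The saved Phraser can later be reloaded and applied to freshly tokenized text; a minimal sketch, assuming the same dump_base prefix used above:

from gensim.models.phrases import Phraser
from gensim.utils import simple_tokenize

dump_base = "./"  # assumption: same prefix as in mk_bigrams above
bigramer = Phraser.load(dump_base + "bigramer")
new_sentence = list(simple_tokenize("The district court dismissed the appeal."))
print(bigramer[new_sentence])  # detected bigrams are joined with "_"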
Example No. 10
def function_summarize():
    text = request.form['text']
    sentences = split_sentences(text)
    if len(sentences) < 5:
        return jsonify({
            "ERROR":
            "Not enough sentences found. There must be at least 5 sentences for summary."
        }), 400
    processed_text = summarize(text)
    print(processed_text)
    dict_sample = {'key': processed_text}
    return jsonify(dict_sample)
Example No. 11
    def summarize_text(self, text: str):
        if len(split_sentences(text)) > 1:
            try:
                pred: str = summarize(text, **self.model_params)
            except ValueError:
                pred = text
        else:
            pred: str = text
        if not pred:
            pred = 'none'

        return pred
Example No. 12
def split_and_preprocess(text: str,
                         token_filters: List[Callable]) -> List[str]:
    # step 1
    original_sentences = split_sentences(text)

    # step 2
    filtered_sentences = []
    for sentence in original_sentences:
        processed_sentence = preprocess_string(sentence, filters=token_filters)
        filtered_sentences.append(" ".join(processed_sentence))

    sentences = merge_syntactic_units(original_sentences, filtered_sentences)
    return sentences
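token_filters is expected to be a list of gensim preprocessing callables (each takes and returns a string). A hedged usage example, assuming gensim 3.x:

from gensim.parsing.preprocessing import (
    remove_stopwords,
    strip_numeric,
    strip_punctuation,
)

filters = [lambda s: s.lower(), strip_punctuation, strip_numeric, remove_stopwords]
units = split_and_preprocess(
    "Gensim ships a TextRank summarizer. It ranks sentences by similarity.",
    filters,
)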
Example No. 13
def textrank_summarize(corpus):
    print("Begin summarizing...")

    list_of_summarization = []

    error_counter = 0
    null_summarization_counter = 0
    for i in range(len(corpus)):
        sample = corpus[i].strip()
        articles = sample.split("story_separator_special_tag")

        try:
            summarization = summarize("\n".join(articles),
                                      word_count=500,
                                      split=True)
            if len(summarization) == 0:
                null_summarization_counter += 1
                summarization = split_sentences("\n".join(articles))
                if len(summarization) == 0:
                    print("*** No Summarization ***", i)
        except ValueError:
            print("ValueError, sample", sample)
            summarization = sample
            list_of_summarization.append(summarization)
            error_counter += 1
            continue

        tmp_list_of_summarization = [[] for _ in range(len(articles))]
        for sent in summarization:
            flag = 0
            for j in range(len(articles)):
                if sent in articles[j]:
                    tmp_list_of_summarization[j].append(sent)
                    flag = 1
            if flag == 0:
                print(i, "****", sent, (sent in " ".join(articles)))

        for k in range(len(tmp_list_of_summarization)):
            tmp_list_of_summarization[k] = " newline_char ".join(
                tmp_list_of_summarization[k])

        list_of_summarization.append(
            " story_separator_special_tag ".join(tmp_list_of_summarization))

        if i % 100 == 0:
            print(i)
            print("------")
        # if i == 5000:
        # 	break

    return list_of_summarization, error_counter, null_summarization_counter
Example No. 14
def processFile(sample):

	# read file from provided folder path
	# f = open(file_name,'r')
	# text_0 = f.read()
	text_0 = sample

	# extract content in TEXT tag and remove tags
	# text_1 = re.search(r"<TEXT>.*</TEXT>",text_0, re.DOTALL)
	# text_1 = re.sub("<TEXT>\n","",text_1.group(0))
	# text_1 = re.sub("\n</TEXT>","",text_1)

	# # replace all types of quotations by normal quotes
	# text_1 = re.sub("\n"," ",text_1)
	
	# text_1 = re.sub("\"","\"",text_1)
	# text_1 = re.sub("''","\"",text_1)
	# text_1 = re.sub("``","\"",text_1)	
	
	# text_1 = re.sub(" +"," ",text_1)

	# segment data into a list of sentences
	# sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
	# lines = sentence_token.tokenize(text_1.strip())	
	lines = split_sentences(text_0 + "\n")

	# setting the stemmer
	sentences = []
	porter = nltk.PorterStemmer()

	# modelling each sentence in file as sentence object
	for line in lines:

		# original words of the sentence before stemming
		originalWords = line[:]
		line = line.strip().lower()

		# word tokenization
		sent = nltk.word_tokenize(line)
		
		# stemming words
		stemmedSent = [porter.stem(word) for word in sent]		
		# stemmedSent = filter(lambda x: x!='.'and x!='`'and x!=','and x!='?'and x!="'" 
		# 	and x!='!' and x!='''"''' and x!="''" and x!="'s", stemmedSent)
		
		# list of sentence objects
		if stemmedSent != []:
			# sentences.append(sentence.sentence(file_name, stemmedSent, originalWords))	
			sentences.append(sentence.sentence(stemmedSent, originalWords))			
	
	return sentences
Example No. 15
    def __init__(self, filename, tokenize=True):
        self.filename = filename
        with open(filename, 'r') as myfile:
            self.text = myfile.read()
            # -- Convert strange utf-8 bytes into punctuations -- #
            self.replaceStrangeChrs()
        # -- Preprocessing: generate a list of sentences -- #
        self.listOfSentences = [
            sentence.lower() for sentence in split_sentences(self.text)
        ]

        # -- Optional: Perform word tokenization -- #
        if tokenize:
            self.tokenizeSentences()
Example No. 16
def conversion(training_data_csv, vector_csv):
    training_data_csv.dropna(inplace=True)

    for index, datapoint in training_data_csv.iterrows():
        temp_list = []
        for sentence in split_sentences(datapoint.text):
            sentence = re.sub("[^A-Za-z]+", ' ', str(sentence)).lower()
            sentence = re.sub(r'\s+', ' ', str(sentence))
            #print(word_tokenize(sentence))
            temp_list.append(encode(word_tokenize(sentence)))
        temp_df = np.array(temp_list)
        temp_df = np.average(temp_df, axis=0)
        temp_df = DataFrame([[str(temp_df)]], columns=['vector'])
        temp_df.to_csv(vector_csv, index=False, header=False, mode='a')
Example No. 17
def mymemory_translate(text, languages="el-en"):
    daily_limit = 1000
    c = cache.get_item('mymemory_words_remaining')
    if c is None or c.is_expired():
        c = cache.set_item('mymemory_words_remaining', daily_limit)
        c.set_expiration_date(timezone.now() + relativedelta(days=+1))

    url = "http://api.mymemory.translated.net/get"
    langpair = languages.replace("-", "|")

    words_to_send = len(re.findall(r'\w+', text))
    words_remaining = int(c.value)
    print('==> Words to send:')
    print(words_to_send)

    def translate(sentence):
        params = {"q": sentence, "langpair": langpair}
        session = get_tor_session()
        response_object = session.post(url, params)
        response = json.loads(response_object.text)
        if response['responseStatus'] != 200:
            update_log.warning('MyMemory responded with {}'.format(
                response['responseStatus']))
        else:
            return response['responseData']['translatedText']

    if words_remaining > words_to_send:
        translated_text = ""
        # The limit of characters for each request is 500
        if len(text) > 500:
            sentences = textcleaner.split_sentences(text)
            for sent in sentences:
                sentence = translate(sent)
                if sentence is not None:
                    translated_text += sentence + "\r\n"
                else:
                    return None
        else:
            translated_text = translate(text)

        words_remaining -= int(words_to_send)
        c.set_value(words_remaining)
        return translated_text
    else:
        update_log.warning('MyMemory reached the daily limit.')
        return None
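Example No. 18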
def tokenize_sentences(domain_concept, text):
    sentences = []
    if not isinstance(text, float):
        text_sentences = textcleaner.split_sentences(text)
        cleaned_sentences = []
        for sent in text_sentences:
            cleaned_sentences.append(nlp(sent.lower()))
        for sentence in cleaned_sentences:
            sent = []
            for w in sentence:
                if (not w.is_stop) and (not w.is_punct) and (not w.like_num) and (not w.like_url) and \
                        ('\n' not in w.text) and (' ' not in w.text) and (len(w.text) > 1):
                    sent.append(lemmatizer.lemmatize(w.text.strip()))
            sentences.append(sent)
    else:
        logging.warning(domain_concept + ": does not have summary")
        pass
    return sentences
Example No. 19
def lexrank_summarize(corpus):
	list_of_summarization = []

	documents = [ split_sentences(sample.replace("story_separator_special_tag", "\n")) for sample in corpus ]
	print("[" + "Document Size: " + str(len(documents)) + "]")
	print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]", "Begin building LexRank model...")	
	lxr = LexRank(documents, stopwords=STOPWORDS['en'])
	print("[" + time.strftime("%H:%M:%S", time.localtime()) + "]", "LexRank model successfully built...")

	for i in range(len(documents)):
		sample = documents[i]
		summary = lxr.get_summary(sample, summary_size=len(sample))
		articles = corpus[i].split("story_separator_special_tag")

		words_counter = 0
		summary_counter = 0
		tmp_summary = [ [] for _ in range(len(articles)) ]

		while words_counter < 500 and summary_counter < len(summary):
			flag = 0
			for j in range(len(articles)):
				if summary[summary_counter] in articles[j]:
					tmp_summary[j].append(summary[summary_counter])
					words_counter += len(summary[summary_counter].split(" "))
					flag = 1
			if flag == 0:
				print("[Error] Summary not in original sample.", summary[summary_counter], i)
			summary_counter += 1
			
		# print("words_counter, summary_counter, total summary", words_counter, summary_counter, len(summary))
		for k in range(len(tmp_summary)):
			tmp_summary[k] = " newline_char ".join(tmp_summary[k])
		list_of_summarization.append(" story_separator_special_tag ".join(tmp_summary))

		if i %100 == 0:
			print("------")
			print(i)
			print("------")
		# if i == 100:
		# 	break

	return list_of_summarization
Example No. 20
def get_sentences(n_bytes, text):
    sents = []
    count = 0
    order = 0
    prev_doc = ""
    # split into sentences with gensim splitter
    for line in split_sentences(text):
        doc = line
        orig = line
        tok = line
        sents.append(SimpleSentence(n_bytes, count, text))
        if not (doc or orig or tok):
            break
        if doc != prev_doc:
            order = 0
        text = orig
        count += 1
        order += 1
        prev_doc = doc
    return sents
Example No. 21
def build_texts(path):
    '''
    Input: path containing text files
    Why: prepare corpus docs for gensim
    Output: two lists, each with one entry per doc:
        raw_texts -- the doc split into sentences
        processed_texts -- the doc split into preprocessed words
    '''
    raw_texts = []
    processed_texts = []
    path = pathlib.Path(path)
    for file in get_files(path):
        text = build_one(file, path)
        raw_texts.append(split_sentences(text))
        processed_texts.append(
            gensim.utils.simple_preprocess(
                text, deacc=True, min_len=3))  #  preprocess=preprocess
    return raw_texts, processed_texts
Example No. 22
# variant 2
text_tokenized = list(tokenize(text, lowercase=True))
# variable used to accumulate the length of every word
total_length = 0
# loop that adds up the length of each individual word to find the average
for word in text_normalized:
    total_length += len(word)
# write out the result
writing(
    'Статистика по тексту.txt', 'w',
    '\t\tСтатистические данные по тексту "city-smells".\n\n1. Средняя длина '
    'слова в тексте: ' + str(round(total_length / len(text_normalized))) +
    ' символов;')

# 2. Average sentence length in the text (using split_sentences from the gensim module):
sentence_list = list(split_sentences(text))
# the total number of words is known from the previous task:
writing(
    'Статистика по тексту.txt', 'a',
    '\n2. Средняя длина предложения в тексте: ' +
    str(round(len(text_tokenized) / len(sentence_list))) + ' слов;')

# 3. How many times longer the longest sentence is than the shortest:
# by characters:
sentence_sizes = []
for sentence in sentence_list:
    sentence_sizes.append(len(sentence))
writing(
    'Статистика по тексту.txt', 'a',
    '\n3. Cамое длинное предложение длиннее самого (по символам) короткого '
    'предложения в ' + str(max(sentence_sizes) // min(sentence_sizes)) +
Example No. 23
def to_sentences(book):
    sentences = textcleaner.split_sentences(book)
    sentence_tokens = [simple_preprocess(sentence) for sentence in sentences]
    return sentence_tokens
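Example No. 24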
def tokenize_sentences(domain_concept, text):
    my_stop_words = ["a", "about", "above", "after", "again", "against", "ain", "all", "am", "an", "and", "any", "are", "aren",
     "aren't", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can",
     "couldn", "couldn't", "d", "did", "didn", "didn't", "do", "does", "doesn", "doesn't", "doing", "don", "don't",
     "down", "during", "each", "few", "for", "from", "further", "had", "hadn", "hadn't", "has", "hasn", "hasn't",
     "have", "haven", "haven't", "having", "he", "her", "here", "hers", "herself", "him", "himself", "his", "how", "i",
     "if", "in", "into", "is", "isn", "isn't", "it", "it's", "its", "itself", "just", "ll", "m", "ma", "me", "mightn",
     "mightn't", "more", "most", "mustn", "mustn't", "my", "myself", "needn", "needn't", "no", "nor", "not", "now", "o",
     "of", "off", "on", "once", "only", "or", "other", "our", "ours", "ourselves", "out", "over", "own", "re", "s",
     "same", "shan", "shan't", "she", "she's", "should", "should've", "shouldn", "shouldn't", "so", "some", "such", "t",
     "than", "that", "that'll", "the", "their", "theirs", "them", "themselves", "then", "there", "these", "they",
     "this", "those", "through", "to", "too", "under", "until", "up", "ve", "very", "was", "wasn", "wasn't", "we",
     "were", "weren", "weren't", "what", "when", "where", "which", "while", "who", "whom", "why", "will", "with", "won",
     "won't", "wouldn", "wouldn't", "y", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
     "yourselves", "could", "he'd", "he'll", "he's", "here's", "how's", "i'd", "i'll", "i'm", "i've", "let's", "ought",
     "she'd", "she'll", "that's", "there's", "they'd", "they'll", "they're", "they've", "we'd", "we'll", "we're",
     "we've", "what's", "when's", "where's", "who's", "why's", "would", "able", "abst", "accordance", "according",
     "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "afterwards", "ah",
     "almost", "alone", "along", "already", "also", "although", "always", "among", "amongst", "announce", "another",
     "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently",
     "approximately", "arent", "arise", "around", "aside", "ask", "asking", "auth", "available", "away", "awfully", "b",
     "back", "became", "become", "becomes", "becoming", "beforehand", "begin", "beginning", "beginnings", "begins",
     "behind", "believe", "beside", "besides", "beyond", "biol", "brief", "briefly", "c", "ca", "came", "cannot",
     "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing",
     "contains", "couldnt", "date", "different", "done", "downwards", "due", "e", "ed", "edu", "effect", "eg", "eight",
     "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "etc", "even", "ever",
     "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "ff", "fifth", "first",
     "five", "fix", "followed", "following", "follows", "former", "formerly", "forth", "found", "four", "furthermore",
     "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten",
     "h", "happens", "hardly", "hed", "hence", "hereafter", "hereby", "herein", "heres", "hereupon", "hes", "hi", "hid",
     "hither", "home", "howbeit", "however", "hundred", "id", "ie", "im", "immediate", "immediately", "importance",
     "important", "inc", "including", "indeed", "index", "information", "instead", "invention", "inward", "itd", "it'll", "j", "k",
     "keep", "keeps", "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter",
     "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look",
     "looking", "looks", "ltd", "made", "mainly", "make", "makes", "many", "may", "maybe", "mean", "means", "meantime",
     "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "moreover", "mostly", "mr", "mrs", "much", "mug",
     "must", "n", "na", "name", "named", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs",
     "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "nobody", "non", "none", "nonetheless",
     "noone", "normally", "nos", "noted", "nothing", "nowhere", "obtain", "obtained", "obviously", "often", "oh", "ok",
     "okay", "old", "omitted", "one", "ones", "onto", "ord", "others", "otherwise", "outside", "overall", "owing", "p",
     "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus",
     "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily",
     "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather",
     "rd", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related",
     "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "said", "saw", "say",
     "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self",
     "selves", "sent", "seven", "several", "shall", "shed", "shes", "show", "showed", "shown", "showns", "shows",
     "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "somebody", "somehow",
     "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry",
     "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially",
     "successfully", "sufficiently", "suggest", "sup", "sure", "take", "taken", "taking", "tell", "tends", "th",
     "thank", "thanks", "thanx", "thats", "that've", "thence", "thereafter", "thereby", "thered", "therefore",
     "therein", "there'll", "thereof", "therere", "theres", "thereto", "thereupon", "there've", "theyd", "theyre",
     "think", "thou", "though", "thoughh", "thousand", "throug", "throughout", "thru", "thus", "til", "tip", "together",
     "took", "toward", "towards", "tried", "tries", "truly", "try", "trying", "ts", "twice", "two", "u", "un",
     "unfortunately", "unless", "unlike", "unlikely", "unto", "upon", "ups", "us", "use", "used", "useful", "usefully",
     "usefulness", "uses", "using", "usually", "v", "value", "various", "'ve", "via", "viz", "vol", "vols", "vs", "w",
     "want", "wants", "wasnt", "way", "wed", "welcome", "went", "werent", "whatever", "what'll", "whats", "whence",
     "whenever", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "whim",
     "whither", "whod", "whoever", "whole", "who'll", "whomever", "whos", "whose", "widely", "willing", "wish",
     "within", "without", "wont", "words", "world", "wouldnt", "www", "x", "yes", "yet", "youd", "youre", "z", "zero",
     "a's", "ain't", "allow", "allows", "apart", "appear", "appreciate", "appropriate", "associated", "best", "better",
     "c'mon", "c's", "cant", "changes", "clearly", "concerning", "consequently", "consider", "considering",
     "corresponding", "course", "currently", "definitely", "described", "despite", "entirely", "exactly", "example",
     "going", "greetings", "hello", "help", "hopefully", "ignored", "inasmuch", "indicate", "indicated", "indicates",
     "inner", "insofar", "it'd", "keep", "keeps", "novel", "presumably", "reasonably", "second", "secondly", "sensible",
     "serious", "seriously", "sure", "t's", "third", "thorough", "thoroughly", "three", "well", "wonder", "a", "about",
     "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
     "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
     "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became",
     "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside",
     "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co",
     "con", "could", "couldnt", "cry", "de", "Des", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg",
     "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
     "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for",
     "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had",
     "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
     "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest",
     "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many",
     "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must",
     "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none",
     "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto",
     "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
     "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she",
     "should", "show", "side", "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", "something",
     "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their",
     "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon",
     "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru",
     "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until",
     "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever",
     "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while",
     "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet",
     "you", "your", "yours", "yourself", "yourselves", "the", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
     "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B", "C", "D", "E", "F", "G", "H",
     "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "co", "op",
     "research-articl", "pagecount", "cit", "ibid", "les", "le", "au", "que", "est", "pas", "vol", "el", "los", "pp",
     "u201d", "well-b", "http", "volumtype", "par", "0o", "0s", "3a", "3b", "3d", "6b", "6o", "a1", "a2", "a3", "a4",
     "ab", "ac", "ad", "ae", "af", "ag", "aj", "al", "an", "ao", "ap", "ar", "av", "aw", "ax", "ay", "az", "b1", "b2",
     "b3", "ba", "bc", "bd", "be", "bi", "bj", "bk", "bl", "bn", "bp", "br", "bs", "bt", "bu", "bx", "c1", "c2", "c3",
     "cc", "cd", "ce", "cf", "cg", "ch", "ci", "cj", "cl", "cm", "cn", "cp", "cq", "cr", "cs", "ct", "cu", "cv", "cx",
     "cy", "cz", "d2", "da", "dc", "dd", "de", "df", "di", "dj", "dk", "dl", "do", "dp", "dr", "ds", "dt", "du", "dx",
     "dy", "e2", "e3", "ea", "ec", "ed", "ee", "ef", "ei", "ej", "el", "em", "en", "eo", "ep", "eq", "er", "es", "et",
     "eu", "ev", "ex", "ey", "f2", "fa", "fc", "ff", "fi", "fj", "fl", "fn", "fo", "fr", "fs", "ft", "fu", "fy", "ga",
     "ge", "gi", "gj", "gl", "go", "gr", "gs", "gy", "h2", "h3", "hh", "hi", "hj", "ho", "hr", "hs", "hu", "hy", "i",
     "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ic", "ie", "ig", "ih", "ii", "ij", "il", "in", "io", "ip", "iq",
     "ir", "iv", "ix", "iy", "iz", "jj", "jr", "js", "jt", "ju", "ke", "kg", "kj", "km", "ko", "l2", "la", "lb", "lc",
     "lf", "lj", "ln", "lo", "lr", "ls", "lt", "m2", "ml", "mn", "mo", "ms", "mt", "mu", "n2", "nc", "nd", "ne", "ng",
     "ni", "nj", "nl", "nn", "nr", "ns", "nt", "ny", "oa", "ob", "oc", "od", "of", "og", "oi", "oj", "ol", "om", "on",
     "oo", "oq", "or", "os", "ot", "ou", "ow", "ox", "oz", "p1", "p2", "p3", "pc", "pd", "pe", "pf", "ph", "pi", "pj",
     "pk", "pl", "pm", "pn", "po", "pq", "pr", "ps", "pt", "pu", "py", "qj", "qu", "r2", "ra", "rc", "rd", "rf", "rh",
     "ri", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "rv", "ry", "s2", "sa", "sc", "sd", "se", "sf",
     "si", "sj", "sl", "sm", "sn", "sp", "sq", "sr", "ss", "st", "sy", "sz", "t1", "t2", "t3", "tb", "tc", "td", "te",
     "tf", "th", "ti", "tj", "tl", "tm", "tn", "tp", "tq", "tr", "ts", "tt", "tv", "tx", "ue", "ui", "uj", "uk", "um",
     "un", "uo", "ur", "ut", "va", "wa", "vd", "wi", "vj", "vo", "wo", "vq", "vt", "vu", "x1", "x2", "x3", "xf", "xi",
     "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y2", "yj", "yl", "yr", "ys", "yt", "zi", "zz"]

    for stop_word in my_stop_words:
        lexeme = nlp.vocab[stop_word]
        lexeme.is_stop = True

    sentences = []

    if not isinstance(text, float):
        text_sentences = textcleaner.split_sentences(text)
        cleaned_sentences = []

        for sent in text_sentences:
            cleaned_sentences.append(nlp(sent.lower()))

        for sentence in cleaned_sentences:
            sent = []
            for w in sentence:
                if (not w.is_stop) and (not w.is_punct) and (not w.like_num) and (not w.like_url) and \
                        ('\n' not in w.text) and (' ' not in w.text) and (len(w.text) > 1):
                    sent.append(w.text.strip())
            sentences.append(sent)
    else:
        logging.warning(domain_concept + ": does not have summary")
        pass

    return sentences
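Example No. 25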
# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
sents_list

###############################################################################
#5. Tokenization using Keras

from keras.preprocessing.text import text_to_word_sequence

#Word Tokenization
result = text_to_word_sequence(text)
result

###############################################################################
#6. Tokenization using Gensim

from gensim.utils import tokenize

#Word Tokenization
list(tokenize(text))

#Sentence Tokenization
from gensim.summarization.textcleaner import split_sentences
result = split_sentences(text)
result

###############################################################################
###############################################################################
Example No. 26
        doc2vec_epochs = settings[
            'doc2vec_epochs'] if 'doc2vec_epochs' in settings.keys() else 20
        if 'doc2vec_epochs' not in settings.keys():
            logger.warning(
                'doc2vec epochs not in settings; using default value {}.'.
                format(doc2vec_epochs))
        else:
            logger.info('doc2vec epochs: {}'.format(doc2vec_epochs))

    with open(input_file, 'r') as input_fp:
        text = input_fp.read()
        text = text.split('\n')
        text = text[text_start:text_stop]
        text = ' '.join(text)
        logger.info('text length: {}'.format(len(text)))
        sentences = split_sentences(text)
        if pieces_strategy == pieces_strategies[0]:
            pieces = [
                text[i:i + context_limit_]
                for i in range(0, len(text), context_limit_)
            ] + [
                text[i + context_limit_ // 2:i + 3 * context_limit_ // 2]
                for i in range(0,
                               len(text) - context_limit_, context_limit_)
            ]
        elif pieces_strategy == pieces_strategies[1]:
            pieces = [
                ' '.join(sentences[index:index + sentences_per_chunk])
                for index in range(0, len(sentences), sentences_per_chunk)
            ]
        else:
Example No. 27
# In[7]:

logging.info(f' count tokens = {tokenize.all_count_token}')
logging.info(f' vocab size = {len(vocab)}')

# In[8]:

vocab_txt = 'data/clear_data/vocab.txt'
with open(vocab_txt, 'a') as f:
    for token, token_count in vocab:
        f.write(token + '\n')

# In[9]:

sentences_df = data.text.apply(lambda x: split_sentences(x))

# In[10]:

sep = int(len(sentences_df) * 0.8)

# In[11]:

train_sentences_txt = 'data/clear_data/train_sentences.txt'
heldout_sentences_txt = 'data/clear_data/heldout_sentences.txt'

with open(train_sentences_txt, 'a') as f:
    for sentence_list in sentences_df[:sep]:
        for sentence in sentence_list:
            f.write(sentence[0].lower() + sentence[1:] + '\n')
Example No. 28
 def summarize_text(self, text: str):
     sentences = split_sentences(text)
     if 'риа новости' in sentences[0]:
         return sentences[1]
     else:
         return sentences[0]
Example No. 29
# whether to run tests
test_acc = False

# currently available: nyt, washpo
source_name = 'nyt'
source_path = 'source_embeddings/' + source_name

data_source = 'nexis'

if not os.path.isfile(source_path) or force_retrain:
    with open('../data/%s.csv' % source_name) as f:
        reader = csv.reader(f)
        articles = [r[1] for r in reader]
    sentences = []
    for article in articles:
        art = split_sentences(article)
        sentences += [list(tokenize_by_word(sen)) for sen in art]
    bigram_transformer = Phrases(sentences)
    sentences = bigram_transformer[sentences]
    model = gensim.models.Word2Vec(sentences,
                                   size=100,
                                   window=10,
                                   min_count=2,
                                   workers=10)
    model.train(sentences, total_examples=len(sentences), epochs=50)
    model.save(source_path)
else:
    model = gensim.models.Word2Vec.load(source_path)

if test_acc:
    model.accuracy('questions-words.txt')
Example No. 30
                                         mode='w+')  # File created by code

print("read unprocessed text")
df = read_csv(custom_embedding_text_data, encoding='utf-8')

print('replace all nan with empty string')
df.replace(nan, '', regex=True, inplace=True)
print("drop all nan")
df = df.dropna().reset_index(drop=True)
df = DataFrame(df.text.unique(), columns=['text'])

print("remove tags from unprocessed text and write to temporary clean text")

for i in range(len(df)):

    DataFrame(split_sentences(remove_tags(df.iloc[i]['text'])), columns=['temp_clean'], dtype=str).\
                                                    to_csv('temporary_clean_text.csv', header=False, index=False, mode='a')

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s",
                    datefmt='%H:%M:%S',
                    level=logging.INFO)

del df
gc.collect()

print("read temporary clean text to convert to lower case")
df_temp_clean = read_csv('temporary_clean_text.csv',
                         usecols=['temp_clean'],
                         dtype={'temp_clean': str},
                         lineterminator='\n')
brief_cleaning = (sub("[^A-Za-z]+", ' ', str(row)).lower()
Example No. 31
def splitToSentences(text):
    return textcleaner.split_sentences(text)