Пример #1
0
def get_idf_weighting_list(doc, sentence_count, N, lang):
    """Return one smoothed IDF-style weight per sentence of ``doc``.

    Every sentence yielded by ``doc_to_sentence(doc, lang)`` is stripped and
    looked up in ``sentence_count``; its weight is
    ``1 + log((1 + N) / (1 + count))``.

    Args:
        doc: Document passed to ``doc_to_sentence``.
        sentence_count: Mapping from stripped sentence to its count.
        N: Value used in the numerator of the ratio (presumably the number
            of documents in the collection -- confirm against the caller).
        lang: Language code forwarded to ``doc_to_sentence``.

    Returns:
        A list of floats, one per sentence, in sentence order.
    """
    weights = []
    for sentence in doc_to_sentence(doc, lang):
        # A sentence absent from the table is treated as having count 0.
        # The "+1" smoothing keeps the ratio finite in that case, so there
        # is no reason to fail with KeyError.
        count = sentence_count.get(sentence.strip(), 0)
        weights.append(1 + math.log((1.0 + N) / (1.0 + count)))
    return weights
Пример #2
0
def get_senetence_frequencies(doc, word, lang):
    """Count the sentences of ``doc`` for which ``word in sentence`` holds.

    Sentences come from ``doc_to_sentence(doc, lang)``. The membership test
    is applied to each sentence object as-is.

    Returns:
        The number of matching sentences as an int (0 when none match).
    """
    return sum(1 for sent in doc_to_sentence(doc, lang) if word in sent)
Пример #3
0
def get_sentence_length_weighting_list(doc, lang):
    """Return each sentence's scaled token count as a share of the total.

    For every sentence from ``doc_to_sentence(doc, lang)`` the number of
    whitespace-separated tokens is multiplied by ``get_sentence_count()``
    (called once per sentence), and each product is divided by the sum of
    all products.

    Returns:
        A list with one normalised value per sentence; empty when there are
        no sentences.

    Raises:
        ZeroDivisionError: if there is at least one sentence but the
            products sum to zero.
    """
    scaled_lengths = [
        get_sentence_count() * len(sent.split())
        for sent in doc_to_sentence(doc, lang)
    ]
    denominator = float(sum(scaled_lengths))
    return [value / denominator for value in scaled_lengths]
Пример #4
0
def extract_digits(doc, lang):
    """Collect every run of digits in ``doc`` as an integer.

    Each sentence from ``doc_to_sentence(doc, lang)`` is scanned with the
    pattern ``\\d+``; the matches are converted with ``int`` and returned in
    order of appearance.

    Returns:
        A flat list of ints (empty when no digits are found).
    """
    return [
        int(token)
        for sent in doc_to_sentence(doc, lang)
        for token in re.findall(r'\d+', sent)
    ]
Пример #5
0
def get_sentence_frequency_list(doc, lang):
    """Return the relative frequency of each sentence within ``doc``.

    For every sentence from ``doc_to_sentence(doc, lang)`` the result holds
    (occurrences of that sentence in the document) / (number of sentences).

    Args:
        doc: Document passed to ``doc_to_sentence``.
        lang: Language code forwarded to ``doc_to_sentence``.

    Returns:
        A list of floats aligned with the sentence list; empty when the
        document has no sentences.
    """
    from collections import Counter  # local import keeps the block self-contained

    sentences = doc_to_sentence(doc, lang)
    # float() forces true division even where "/" on two ints truncates.
    total = float(len(sentences))
    # Tally once instead of calling list.count() per sentence (O(n) vs O(n^2)).
    occurrences = Counter(sentences)
    return [occurrences[sent] / total for sent in sentences]
Пример #6
0
def sentence_count_web_domain(documents, lang):
    """Tally how often each stripped sentence occurs across ``documents``.

    Every document is split with ``doc_to_sentence(doc, lang)``; each
    sentence is stripped and every occurrence is counted.

    Returns:
        A plain dict mapping stripped sentence -> occurrence count.
    """
    counts = {}
    for document in documents:
        for raw_sentence in doc_to_sentence(document, lang):
            key = raw_sentence.strip()
            counts[key] = counts.get(key, 0) + 1
    return counts
Пример #7
0
def get_inter_doc_word_idf_weighting_list(doc, word_count, N, lang):
    """Return, per sentence of ``doc``, the sum of its words' smoothed IDF weights.

    Each sentence from ``doc_to_sentence(doc, lang)`` is tokenised with
    ``sentence_to_word(sentence, lang)``; every token (repeats included)
    contributes ``1 + log((1 + N) / (1 + word_count[token]))``.

    Args:
        doc: Document passed to ``doc_to_sentence``.
        word_count: Mapping from word to its count.
        N: Value used in the numerator of the ratio (presumably the number
            of documents in the collection -- confirm against the caller).
        lang: Language code forwarded to both helper functions.

    Returns:
        A list with one weight per sentence; a sentence without words
        contributes ``0``.
    """
    weights = []
    for sentence in doc_to_sentence(doc, lang):
        weight = 0
        for word in sentence_to_word(sentence, lang):
            # A word missing from the table is treated as having count 0.
            # The "+1" smoothing keeps the ratio finite in that case.
            seen_in = word_count.get(word, 0)
            weight += 1 + math.log((1.0 + N) / (1.0 + seen_in))
        weights.append(weight)
    return weights
Пример #8
0
def word_count_over_docs(documents, lang):
    """Count, for each word, the number of documents it appears in.

    Each document is split with ``doc_to_sentence(doc, lang)`` and each
    sentence tokenised with ``sentence_to_word``. A word is counted at most
    once per document, however often it occurs there.

    Args:
        documents: Iterable of documents.
        lang: Language code. Only ``'en'`` and ``'si'`` are tokenised; for
            any other value no words are collected and the result is empty.

    Returns:
        A dict mapping word -> number of documents containing it, with keys
        in order of first appearance.
    """
    word_count = {}
    supported = lang in ('en', 'si')
    for doc in documents:
        # A set gives O(1) "already counted in this document?" checks,
        # where a list scan made the loop quadratic.
        seen = set()
        for sentence in doc_to_sentence(doc, lang):
            words = sentence_to_word(sentence, lang) if supported else []
            for word in words:
                if word in seen:
                    continue
                seen.add(word)
                word_count[word] = word_count.get(word, 0) + 1
    return word_count
Пример #9
0
def get_embeddig_list(doc, lang='en'):
    """Split ``doc`` into sentences and return ``sent_embedding`` of them.

    The sentences come from ``doc_to_sentence(doc, lang)`` and are passed,
    together with ``lang``, to ``sent_embedding``; its result is returned
    unchanged.
    """
    return sent_embedding(doc_to_sentence(doc, lang), lang)