def text_cleaner(text):
    negations_dictionary = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    negations_pattern = re.compile(r'\b(' +
                                   '|'.join(negations_dictionary.keys()) +
                                   r')\b')
    tokenizer = WordPunctTokenizer()
    processed_text = text.lower()
    negation_handled = negations_pattern.sub(
        lambda x: negations_dictionary[x.group()], processed_text)
    processed_text = re.sub("[^A-Za-z]", ' ', negation_handled)
    words = [x for x in tokenizer.tokenize(processed_text) if len(x) > 1]
    return words
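A minimal usage sketch for text_cleaner; the imports shown here are assumptions about the surrounding module, which the snippet does not include:

import re
from nltk.tokenize import WordPunctTokenizer

print(text_cleaner("I don't like it!"))
# ['do', 'not', 'like', 'it']  (the single-letter "i" is dropped by the len > 1 filter)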
Example #2
 def __init__(self, input_size, batch_size, path_to_write):
     self.word_vectors = FastText.load("D:\\Typing\\araneum_none_fasttextskipgram_300_5_2018.model")
     self.input_size = input_size
     self.tokenizer = WordPunctTokenizer()
     self.batch_size = batch_size
     self.path = path_to_write
     self.punctuations = ['.', ',', '-', '\'', '\"', '!', '?', '(', ')', ':', ';']
Example #3
def filter_stop_words(text, stop_words):
    wpt = WordPunctTokenizer()
    tokenized_words = wpt.tokenize(text)
    processed_words = [word for word in tokenized_words if word not in stop_words]
    text = ' '.join(processed_words)
    
    return text
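A quick illustration of the helper above, using a made-up stop word set:

print(filter_stop_words("this is a simple test", {"is", "a"}))
# "this simple test"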
Example #4
def clean_tweet(tweet):
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', tweet)
    number_removed = re.sub('[^a-zA-Z]', ' ', link_removed)
    lower_case_tweet = number_removed.lower()
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)
    clean_tweet = (' '.join(words)).strip()
    return clean_tweet
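A hedged example of what the cleaning steps above produce (the tweet text is invented):

print(clean_tweet("Loving #NLP! More at https://example.com/nlp 2024"))
# "loving nlp more at"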
Example #5
def sentence2words(sentence):
    result = []
    word_punct_tokenizer = WordPunctTokenizer()
    words = word_punct_tokenizer.tokenize(sentence)
    stemmer = nltk.stem.SnowballStemmer('english')
    for word in words:
        ori_word = stemmer.stem(word)
        result.append(ori_word)
    return result
Example #6
def load_task2(articles_path, labels_path, tokenizer='punct'):
    file_names, labels, spans = get_class_labels(labels_path)
    corpus = load_data(articles_path)
    tknz = WordPunctTokenizer()
    samples = []
    for span, file_name in zip(spans, file_names):
        article = corpus[file_name]
        tokenized_span = tknz.tokenize(article[span[0]:span[1]])
        samples.append(tokenized_span)
    return samples, labels, spans, file_names
Example #7
    def _tokenize(self, doc):
        all_tokens = []
        sentences = sent_tokenize(doc)

        tokenizer = WordPunctTokenizer()
        for sentence in sentences:
            words = tokenizer.tokenize(sentence.lower())
            words = [word for word in words if word not in punctuation]
            all_tokens.extend(words)
        return all_tokens
Example #8
def _sentence_tok(delex_texts: List[str]) -> List[List[List[str]]]:
    #tokenize the texts
    sentence_tok_texts = []
    tknzr = WordPunctTokenizer()
    for text in delex_texts:
        sentences = sent_tokenize(text)
        tok_sentences = []
        for sentence in sentences:
            tok_sentences.append(tknzr.tokenize(sentence))
        sentence_tok_texts.append(tok_sentences)

    return sentence_tok_texts
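A small sketch of the expected output, assuming sent_tokenize comes from NLTK and List from typing, as the signature suggests:

print(_sentence_tok(["First sentence. Second sentence!"]))
# [[['First', 'sentence', '.'], ['Second', 'sentence', '!']]]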
Example #9
def stemming_words(text):    
    wpt = WordPunctTokenizer()
    words = wpt.tokenize(text)
    
    turkishStemmer = TurkishStemmer()
    
    stemmed_words = []
    for word in words:
        stemmed_words.append(turkishStemmer.stemWord(word))
    text = ' '.join([str(word) for word in stemmed_words])  
    
#     print (stemmed_words)
    
    return text 
Example #10
 def stemming_words(self, text):
     wpt = WordPunctTokenizer()
     words = wpt.tokenize(text)
     turkishStemmer = TurkishStemmer()
     stemmed_words = []
     for word in words:
         stemmed_words.append(turkishStemmer.stemWord(word))
         # try:
         #     # stemmed_words.append(turkishStemmer.stemWord(word))
         #     stemmed_words.append(word[0:5])
         # except:
         #     # stemmed_words.append(turkishStemmer.stemWord(word))
         #     stemmed_words.append(word)
     text = ' '.join([str(word) for word in stemmed_words])
     return text
Example #11
class PunctTokenizer(object):
    def __init__(self,
                 lower=True,
                 prepend_cls=False,
                 prepend_bos=False,
                 append_eos=False,
                 stopwords=None,
                 specials=SPECIAL_TOKENS):
        self.lower = lower
        self.specials = specials
        self.pre_id = []
        self.post_id = []
        self.stopwords = stopwords
        if prepend_cls and prepend_bos:
            raise ValueError("prepend_bos and prepend_cls are"
                             " mutually exclusive")
        if prepend_cls:
            self.pre_id.append(self.specials.CLS.value)
        if prepend_bos:
            self.pre_id.append(self.specials.BOS.value)
        if append_eos:
            self.post_id.append(self.specials.EOS.value)
        self.punct = WordPunctTokenizer()

    def __call__(self, x):
        if self.lower:
            x = x.lower()

        x = (self.pre_id + self.punct.tokenize(x) + self.post_id)
        if self.stopwords:
            x = [w for w in x if w not in self.stopwords]
        return x
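A hypothetical usage sketch; SPECIAL_TOKENS is not defined in this snippet, so the exact special-token strings in the comment are assumptions:

tok = PunctTokenizer(lower=True, prepend_bos=True, append_eos=True)
print(tok("Hello, world!"))
# e.g. ['[BOS]', 'hello', ',', 'world', '!', '[EOS]']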
Example #12
 def __init__(self):
     HTMLParser.__init__(self)
     self.inside_dd = False
     self.bgrams = {}
     self.sorted_bgrams = []
     self.tokenizer = WordPunctTokenizer()
     self.token_count = 0
Example #13
def setupTextPreProcess():
    #import stopwords
    global stopWords
    global tokenizer
    global stemmer

    #save some space
    stopWords = [
        'wouldn', 'which', "it's", "didn't", 'her', 'she', "hasn't", "you've",
        'some', 'more', 'is', 'couldn', 'hers', 'no', 'so', 'ours', 'd',
        "shouldn't", 'off', 'was', 'about', "mightn't", 'own', 'above', 'most',
        'out', 'you', 'just', "hadn't", 'haven', 'o', 'once', 'shouldn', 'few',
        'y', 've', 'your', 'ourselves', 'why', 'does', 'me', 'at', 'mustn',
        "you'd", 'then', 'weren', 're', 'below', 'should', 'doesn', 'when',
        "she's", 'can', 'of', 'that', "weren't", 'after', 'any', "should've",
        'wasn', 't', 'very', 'these', 'yourself', 'nor', 'my', 'won', 'his',
        'both', 'same', 'yours', 'only', 'but', 'our', 'it', 'has', 'be',
        "wasn't", 'herself', 'its', 'mightn', "needn't", 'as', 'am', 'are',
        'they', 'their', 'doing', 'during', 'again', 'all', "doesn't",
        'between', 'under', 'over', "that'll", "haven't", 'this', 'will',
        'until', "couldn't", 'had', 'myself', 'because', 'than', "won't", 'he',
        'have', 'with', 'don', 'hadn', 'who', 'up', "wouldn't", 'ain', 'them',
        'i', 'through', 'aren', 'here', 'themselves', "shan't", 'if', 'what',
        'ma', 'yourselves', 'theirs', 'a', 'against', 'being', 'to', 'where',
        'on', 'having', 'himself', 'each', "you'll", "don't", 'further',
        "isn't", 'while', 'in', 'how', 'such', 'now', 'from', 'needn', 'there',
        'hasn', 'too', 'm', 'or', 'not', 'didn', 'whom', 'down', 'shan', 'll',
        "you're", 's', "mustn't", 'him', 'were', 'an', 'we', "aren't", 'by',
        'been', 'the', 'itself', 'before', 'did', 'into', 'and', 'for', 'do',
        'other', 'those', 'isn'
    ]

    tokenizer = WordPunctTokenizer()
    stemmer = PorterStemmer()
Example #14
def pre_process(data):
    clean_reviews = []

    # useless symbols that should be removed
    rmSignal = ['.', '#', '$', '?', '!', '%', ':/', ':', '-', '+', '/', '"']

    for comment_content in data['review']:
        texts = WordPunctTokenizer().tokenize(comment_content)  # NLTK tokenization

        text = [word.lower() for word in texts]
        # filter in a single pass (removing from the list while iterating over it
        # skips elements): drop symbols, digits, and ellipses the tokenizer
        # cannot recognize
        text = [
            word for word in text
            if word not in rmSignal
            and not word.isdigit()
            and not re.search("[(.|…)]+", word)
        ]

        new_sentence = " ".join(text)  # change to string
        clean_reviews.append(new_sentence)

    return clean_reviews
Example #15
    def __init__(self):
        """Set up map."""
        self.word_tokenizer = WordPunctTokenizer()

        filename = join(split(__file__)[0], 'data', 'compounds.txt')

        self.decompound_map = {}
        with open(filename, encoding='utf-8') as fid:
            for line in fid:
                parts = line.strip().split('|')
                compound = "".join(parts)
                decompounded_parts = [
                    part for part in parts if part != 's' and part != 'e'
                ]
                decompounded = " ".join(decompounded_parts)
                self.decompound_map[compound] = decompounded
Example #16
class NewsgroupsReader(object):
    def __init__(self, tokenize):
        self._tokenize = tokenize
        self._tokenizer = WordPunctTokenizer()

    def get_training(self):
        return self._get_docs('datasets/20news-bydate-train')

    def get_test(self):
        return self._get_docs('datasets/20news-bydate-test')

    def _get_docs(self, path):
        doc_objects = []
        i = 0

        for category in listdir(path):
            for f in listdir(path + "/" + category):
                with codecs.open(path + "/" + category + "/" + f, 'r', encoding='latin1') as content_file:
                    text = content_file.read()
                    tokens = self._tokenizer.tokenize(text) if self._tokenize else text
                    doc_objects.append(Document(i, tokens, category))
                    i += 1

        random.shuffle(doc_objects)
        return doc_objects
Example #17
def nGrams(string, corpus, number, clean=True):
    global wordList
    words = WordPunctTokenizer().tokenize(string)
    stopset = set(stopwords.words('english'))
    # lowercase all tokens (done regardless of the clean flag)
    words = [word.lower() for word in words]
    # skip single-character tokens and plain numbers when building collocations
    word_filter = lambda w: len(w) < 2 or w.isdigit()

    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(word_filter)
    biResult = bcf.nbest(BigramAssocMeasures.likelihood_ratio, number)

    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(word_filter)
    triResult = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, number)

    biList = [" ".join(bigram) for bigram in biResult]
    triList = [" ".join(trigram) for trigram in triResult]

    with open(r'db\cyttron-keywords.csv', 'a') as csv_file:
        csv_file.write('"' + ','.join(biList) + '";')
        csv_file.write('"' + ','.join(triList) + '"\n')
    print(biList)
    print(triList)
Example #19
    def query(self, sent):

        sent = Query.decorate_sent(sent)
        elements = WordPunctTokenizer().tokenize(sent)
        post = self.to_post(elements)
        print(post)
        result = self.calculate_post(post)
        return result
Example #20
def get_tense_verb_groups(text, verbose=False):
    tokens = WordPunctTokenizer().tokenize(text)
    if verbose:
        print("*************", len(tokens))
    tokens_idx = list(WordPunctTokenizer().span_tokenize(text))
    tokens = nltk.pos_tag(tokens)
    tokens = manual_pos_correction(tokens)
    if verbose:
        print("================", len(tokens), len(tokens_idx))
        print("tokens: ", tokens)
    verb_groups = get_verbs(tokens, tokens_idx)
    if verbose:
        print("verb_groups: ", verb_groups)
    tense_verb_groups = []
    for verb_group in verb_groups:
        tense_verb_group = get_tense(verb_group, verbose=verbose)
        tense_verb_groups.append(tense_verb_group)
    return tense_verb_groups
Example #21
 def __init__(self):
     HTMLParser.__init__(self)
     self.inside_dd = False
     self.doc_id = 0
     self.token_count = 0
     self.token_sum_len = 0      
     self.iindex = {}
     self.paragraphs = []
     self.tokenizer = WordPunctTokenizer()
     self.stemmer = RussianStemmer()
Example #22
def freqNouns(string, corpus, number):
    nouns = []
    words = WordPunctTokenizer().tokenize(string)
    pos = nltk.pos_tag(words)
    for i in range(len(pos)):
        if len(pos[i][0]) > 1:
            if pos[i][1] == 'NN' or pos[i][1] == 'NNP':
                nouns.append(pos[i][0])
    newString = ' '.join(nouns).lower()
    freqWords(newString, corpus, number)
Example #23
def cleanDoc(doc):
    # String goes in, list of words comes out
    global stopset
    stemmer = nltk.PorterStemmer()
    tokens = WordPunctTokenizer().tokenize(doc)
    clean = [
        token.lower() for token in tokens
        if token.lower() not in stopset and len(token) > 1 and token.isalnum()
    ]
    final = [stemmer.stem(word) for word in clean]
    return final
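A usage sketch, assuming the module-level stopset global is a set of stop words (for example NLTK's English list) and the usual nltk imports are present:

from nltk.corpus import stopwords
stopset = set(stopwords.words('english'))
print(cleanDoc("The cats are running quickly!"))
# ['cat', 'run', 'quickli']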
Example #24
def stemOnto(ontolist):
    stemmer = nltk.PorterStemmer()
    templist = []
    for i in range(len(ontolist)):
        templist = []
        tokens = WordPunctTokenizer().tokenize(ontolist[i][0])
        for j in range(len(tokens)):
            stem = stemmer.stem(tokens[j])
            templist.append(stem)
        ontolist[i][0] = ' '.join(templist)
    print "Stemmed", len(ontolist), "things"
Example #25
def stemList(sourceList):
    stemmer = nltk.PorterStemmer()
    templist = []
    for i in range(len(sourceList)):
        templist = []
        tokens = WordPunctTokenizer().tokenize(sourceList[i])
        for j in range(len(tokens)):
            stem = stemmer.stem(tokens[j])
            templist.append(stem)
        sourceList[i] = ' '.join(templist)
        print(sourceList[i])
    print("Stemmed", len(sourceList), "texts")
Example #26
def get_average_embedding(embedding, review):
    """
    returns a list of word vectors for all words in review
    then average them to return a final vector

    :param embedding: embedding object - will be either Fasttext or Word2Vec
    :param review: review text
    :return:
    """
    log.debug(f'Getting average embedding for: [{review}]')

    wpt = WordPunctTokenizer()
    # word_vectors = [embedding.wv.get_vector(word) for word in wpt.tokenize(review)]
    word_vectors = [embedding.wv.get_vector(word) for word in wpt.tokenize(review) if word in embedding.wv.vocab]
    log.debug(f'word_vector shape [{np.shape(word_vectors)}]')

    # return average all word vectors to come up with final vector for the review
    # since we are using pre-trained embedding, we may not be able to find all the words
    if np.shape(word_vectors)[0] > 1:
        return np.average(word_vectors, axis=0)
    return None
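A hypothetical usage sketch; it assumes the module defines log (a logger) and imports numpy as np, and it builds a tiny gensim 3.x Word2Vec model, since the wv.vocab attribute used above was removed in gensim 4.x:

from gensim.models import Word2Vec

model = Word2Vec([["great", "movie"], ["poor", "plot"]], size=50, min_count=1)
vector = get_average_embedding(model, "great movie")  # averaged 50-dimensional vector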
Example #27
    def __init__(self,
                 thesaurus,
                 need_deeppavlov=True,
                 deeppavlov_model=None,
                 need_syntax=True,
                 syntax_model=None):
        self.need_deeppavlov = need_deeppavlov

        if need_deeppavlov:
            self.deeppavlov_lemma = deeppavlov_model if deeppavlov_model else build_model(
                configs.morpho_tagger.BERT.morpho_ru_syntagrus_bert,
                download=False)

        if need_syntax:
            self.syntax_model = syntax_model if syntax_model else build_model(
                configs.syntax.syntax_ru_syntagrus_bert, download=False)
        else:
            self.syntax_model = None

        self.tokenizer = WordPunctTokenizer()
        self.thesaurus = thesaurus
Example #28
def choose_longest(mention):
    if len(mention) == 0:
        return "UNK"
    else:
        mention = [
            " ".join(
                list(WordPunctTokenizer().tokenize(re.sub('[^ ]- ', '',
                                                          item))))
            for item in mention
        ]
        idx = np.argmax(np.asarray([len(a.strip().split()) for a in mention]))
        return mention[idx]
Example #29
def preprocess(ftrain, ftest):
    train_data = []
    test_data = []
    word_list = []
    word_set = set()
    ftrain.readline()
    ftest.readline()
    sw = []
    f = open("D:/input/stopwords.txt")
    for line in f.readlines():
        line = line.split('\n')[0]
        sw.append(line)
    for line in ftrain.readlines():
        ls = line.split('\t')
        ls[2] = ls[2].lower()
        word_list = WordPunctTokenizer().tokenize(ls[2])
        train_data.append([ls[0], int(ls[1]), set(word_list)])
        for word in word_list:
            word_set.add(word)
    print("Training data processed")
    for line in ftest.readlines():
        ls = line.split('\t')
        word_list = WordPunctTokenizer().tokenize(ls[1])
        # filter stop words without mutating the list while iterating over it
        word_list = [word for word in word_list if word not in sw]
        test_data.append([ls[0], 0, set(word_list)])
    print("Testing data processed")
    word_set = word_set - (word_set & set(sw))
    return train_data, test_data, word_set
Example #30
 def __init__(self,
              lower=True,
              prepend_cls=False,
              prepend_bos=False,
              append_eos=False,
              stopwords=None,
              specials=SPECIAL_TOKENS):
     self.lower = lower
     self.specials = specials
     self.pre_id = []
     self.post_id = []
     self.stopwords = stopwords
     if prepend_cls and prepend_bos:
         raise ValueError("prepend_bos and prepend_cls are"
                          " mutually exclusive")
     if prepend_cls:
         self.pre_id.append(self.specials.CLS.value)
     if prepend_bos:
         self.pre_id.append(self.specials.BOS.value)
     if append_eos:
         self.post_id.append(self.specials.EOS.value)
     self.punct = WordPunctTokenizer()
Example #31
def wordNetWordMatch(string):
    newString = ""
    string = WordPunctTokenizer().tokenize(string)
    for i in range(len(string)):
        currentWord = string[i].lower()
        synonyms = []
        for syn in wordnet.synsets(currentWord):
            # Synset.lemmas and Lemma.name are methods in current NLTK releases
            for lemma in syn.lemmas():
                synonyms.append(lemma.name().replace('_', ' ').lower())
        synonyms = set(synonyms)
        word = ', '.join(synonyms)
        newString += word
    wordMatch(newString)
Example #32
 def remove_stop_words(self, sentence):
     """
     remove stop_words of sentence
     :param sentence:
     :return:
     """
     words = WordPunctTokenizer().tokenize(sentence)
     st = stopwords.words('english')
     str_list = []
     for token in words:
         if token not in st:
             str_list.append(token)
     return " ".join(str_list)
Example #33
class CustomTokenizer:
    def __init__(self, unicode_to_ascii=True, punct_one_token_per_char=True):
        self.unicode_to_ascii = unicode_to_ascii
        self.punct_one_token_per_char = punct_one_token_per_char

        # \p{P} (any Unicode punctuation) is not supported by the stdlib re module;
        # this snippet presumably relies on the third-party regex package
        # (e.g. import regex as re at module level).
        self._re_punct = re.compile(r"(\p{P})")
        self._tokenizer = WordPunctTokenizer()

    def tokenize(self, text):
        if self.unicode_to_ascii:
            text = unidecode(text)
        if self.punct_one_token_per_char:
            text = re.sub(self._re_punct, "\\1 ", text)
        return self._tokenizer.tokenize(text)
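A hypothetical usage sketch, assuming the module imports the third-party regex package as re (needed for \p{P}) and unidecode from the unidecode package:

tok = CustomTokenizer()
print(tok.tokenize("naïve user's café!"))
# ['naive', 'user', "'", 's', 'cafe', '!']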
Example #34
def label_data(corpus, corpus_labels, tokenizer='punct'):
    samples, labels, spans, file_names = [], [], [], []
    tknz = tree_bank_tokenizer(
    ) if tokenizer == 'treebank' else WordPunctTokenizer()
    print('Preprocessing')
    regex = get_regex()
    for article_name in corpus:
        article_propaganda_spans = iter([])
        if corpus_labels:
            article_propaganda_spans = iter(
                sorted(corpus_labels[article_name]
                       )) if article_name in corpus_labels else iter([])
        propaganda_span = next(article_propaganda_spans, [])
        split_article = corpus[article_name].split('\n')
        index_offset = 0
        first_in_span = True
        for line in split_article:
            if not line:
                index_offset += 1
                continue
            line = re.sub(regex, lambda sent: '~' * len(sent.group()), line)
            line = sub_unicode_chars(line)
            tokenized_line = tknz.tokenize(line)
            span_tokenized_line = tknz.span_tokenize(line)
            line_spans, line_labels = [], []
            for _ in tokenized_line:
                span_wo_offset = next(span_tokenized_line)
                current_span = (span_wo_offset[0] + index_offset,
                                span_wo_offset[1] + index_offset)
                if propaganda_span and current_span[0] >= propaganda_span[1]:
                    propaganda_span = next(article_propaganda_spans, [])
                    first_in_span = True
                if propaganda_span and propaganda_span[0] <= current_span[
                        0] < propaganda_span[1]:
                    line_labels.append(1 if first_in_span else 2)
                    first_in_span = False
                else:
                    line_labels.append(0)
                line_spans.append(current_span)
            assert len(tokenized_line) == len(line_labels) == len(
                line_spans
            ), 'Number of tokens is not equal to the number of spans or labels'
            labels.append(line_labels)
            spans.append(line_spans)
            samples.append(tokenized_line)
            file_names.append(article_name)
            index_offset += len(line) + 1
    print('Preprocessing done')
    return samples, labels, spans, file_names
Example #35
class TolstojParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.bgrams = {}
        self.sorted_bgrams = []
        self.tokenizer = WordPunctTokenizer()
        self.token_count = 0


    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
        else:
            self.inside_dd = False


    def handle_data(self, data):
        if self.inside_dd:
            tokens = self.tokenizer.tokenize(data.lower())
            # count bigram frequencies, skipping pairs that start with punctuation
            for t1, t2 in zip(tokens, tokens[1:]):
                self.token_count += 1

                if (t1[0] in string.punctuation) or (t2[0] in string.punctuation):
                    continue

                key = t1 + ' ' + t2
                self.bgrams[key] = self.bgrams.get(key, 0) + 1

    def dump_bgrams(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.bgrams, output)
        output.close()


    def make_sorted_bgrams(self):
        self.sorted_bgrams = sorted(self.bgrams.items(), key=lambda x: x[1], reverse=True)

    def print_sorted_bgrams(self):
        for key, count in self.sorted_bgrams:
            print(key, count)
Example #36
from gensim import corpora, models, similarities
from nltk import WordPunctTokenizer
import re

NUM_TOPICS = 40

stopwords = open('stopwords.txt').read().split('\n')
word_re = re.compile(r'[a-z0-9\s]+')
tokenizer = WordPunctTokenizer()
tokenize = lambda text: [w.lower()
                         for w in tokenizer.tokenize(text)
                         if re.match(word_re, w.lower()) and w.lower() not in stopwords]

id2word = corpora.Dictionary.load('dictionary.dict')
mm = corpora.MmCorpus('tfidf.mm')
lsi = models.lsimodel.LsiModel(corpus=mm, id2word=id2word, num_topics=NUM_TOPICS)
dic = corpora.Dictionary.load('dictionary.dict')

def get_topics(text, num, model=lsi):
    """ get +num+ topics for text +text+ """
    topics = []

    for t in sorted(model[dic.doc2bow(tokenize(text))],
                    key=lambda t: t[1],
                    reverse=True)[:num]:

        topics.append([u[1] for u in lsi.show_topic(t[0])])

    return topics
Example #37
 def __init__(self, tokenize):
     self._tokenize = tokenize
     self._tokenizer = WordPunctTokenizer()
Example #38
class KareninaParser(HTMLParser):

    def __init__(self):
        HTMLParser.__init__(self)
        self.inside_dd = False
        self.doc_id = 0
        self.token_count = 0
        self.token_sum_len = 0      
        self.iindex = {}
        self.paragraphs = []
        self.tokenizer = WordPunctTokenizer()
        self.stemmer = RussianStemmer()


    def handle_starttag(self, tag, attrs):
        if tag == "dd":
            self.inside_dd = True
            self.doc_id += 1
        else:
           self.inside_dd = False


    def handle_data(self, data):
        if self.inside_dd:
            self.paragraphs.append(data)
            terms = set()
            for token in self.tokenizer.tokenize(data.lower()):
                if token[0] in string.punctuation:
                    continue

                self.token_count += 1
                self.token_sum_len += len(token)                   

                term = self.stemmer.stem(token)                  

                if term not in terms:
                    terms.add(term)
                    if term in self.iindex:
                        self.iindex[term].append(self.doc_id)
                    else:
                        self.iindex[term] = [self.doc_id]


    def dump_iindex(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.iindex, output)
        output.close()


    def dump_paragraphs(self, output_name):
        output = open(output_name, 'wb')
        pickle.dump(self.paragraphs, output)
        output.close()


    def get_stat(self):
        term_sum_len = 0
        for term in self.iindex.keys():
            term_sum_len += len(term)

        term_count = len(self.iindex.keys())
        
        if not (term_count and self.token_count):
            self.stat = {}

        else:
            self.stat = {
                'token_count': self.token_count,
                'token_avg_len': self.token_sum_len/float(self.token_count),
                'term_count': term_count,
                'term_avg_len': term_sum_len/float(term_count)
            }

        return self.stat


    def print_iindex(self):
        for term in sorted(self.iindex.keys()):
            posting_list = self.iindex[term]
            print(term)
            print(len(posting_list))
            print(posting_list)
            print('---------------------')