Example No. 1
 def similarity(self, context, relation):
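     # Fuzzy-match the relation's label (minus any parenthesised text) against
     # n-grams of the context; return the label's token count on an exact match, else 0.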
     try:
         relation_label = self.prop_map[relation]
         txt_in_brackets = re.findall(r'\(.*\)', relation_label)
         for txt in txt_in_brackets:
             relation_label = relation_label.replace(txt, '').strip()
     except:
         return 0.0
     tokenizer = WordPunctTokenizer()
     rel_tokens_size = len(tokenizer.tokenize(relation_label.lower()))
     context_tokens = tokenizer.tokenize(context.lower())
     if rel_tokens_size > 1:
         context_ngram_tokens = [
             context_tokens[i:i + rel_tokens_size]
             for i in range(len(context_tokens) - rel_tokens_size + 1)
         ]
         context_tokens = context_ngram_tokens
     max_similarity = 0
     for c_token in context_tokens:
         sim = fuzz.token_sort_ratio(relation_label, c_token)
         if sim > max_similarity:
             max_similarity = sim
     if max_similarity == 100:
         return max_similarity * rel_tokens_size / 100
     return 0
Example No. 2
def tfIdf():
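	# Score every token of each document by tf * 1000 / idf, keep the scores above
	# TFIDF_MIN_SCORE, store them on the document and render them in a template.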
	TFIDF_MIN_SCORE = 100
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	docs = collection.find()
	tfidf = []
	idfMap = create_idf_map()
	docs = collection.find()
	for d in docs:
		tfMap = {}
		for word in tokenizer.tokenize(d['content'].lower()):
			if word not in tfMap:
				tfMap[word] = 1
			else:
				tfMap[word] += 1
		tfIdfValues = []
		for word in set(tokenizer.tokenize(d['content'].lower())):
			if (tfMap[word] * 1000 / idfMap[word]) > TFIDF_MIN_SCORE:
				tfIdfValues.append((word, tfMap[word] * 1000 / idfMap[word]))
		tfIdfValues = sorted(tfIdfValues, key = lambda x : x[1], reverse = True)
		d['tfidf'] = tfIdfValues
		tfidf.append({'d' : d,
					  'tfidf' : tfIdfValues})
		collection.save(d)


	genFreq = generaral_frequency(idfMap)
	return render_template("tfidf.html", documents = tfidf)
Example No. 3
def eng_seg():
    # punctuation-based word segmentation
    from nltk.tokenize import WordPunctTokenizer
    tokenizer = WordPunctTokenizer()
    print(tokenizer.tokenize("don't do that!"))
    path = "D:\\nlp语料\\机器翻译语料\\english.raw.sample.txt"
    f = open(path, "r")
    '''
    text = f.read()
    splChars = set()
    for ch in text:
        if (ch >= 'a' and ch <= 'z') or  (ch >= 'A' and ch <= 'Z'):
            pass
        else:
            splChars.add(ch)
    
    print(splChars)
    '''

    lines = f.readlines()
    print(len(lines))
    line_tokenized = []
    split_char = " "
    for line in lines:
        line_tokenized.append(split_char.join(tokenizer.tokenize(line)))
    f2 = open("D:\\nlp语料\\机器翻译语料\\english.raw.sample.seg.txt", "w")
    for line in line_tokenized:
        f2.write(line + "\n")
    f.close()
    f2.close()
Example No. 4
def class1():
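	# Train an NLTK Naive Bayes classifier on bag-of-words features of tagged documents,
	# then rank tag probabilities for the document requested via the 'd' query parameter.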
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	docId = request.args.get('d')
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	featuresets = []
	tagSet = set()
	for d in collection.find():	
		bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
		if 'tags' not in d: continue
		for tag in d['tags']:
			featuresets.append((bagOfWords, tag))
			tagSet.add(tag)
	classifier = nltk.NaiveBayesClassifier.train(featuresets)

	d = collection.find_one({'_id' : ObjectId(docId)})

	#classifier.show_most_informative_features(100)
	cl = classifier.prob_classify(bag_of_words(tokenizer.tokenize(d['content'])))
	probs = []
	for tag in tagSet:
		probs.append((tag, round(cl.prob(tag)*100) ))
	classifier.show_most_informative_features(n=20)
	probs = sorted(probs, key = lambda x : x[1],  reverse = True)
	return render_template('class1.html', probs = probs, d=d)
Example No. 5
class AbstractStemmer(object):
    def __init__(self):
        super(AbstractStemmer, self).__init__()
        self.tokenizer = WordPunctTokenizer()
        self.vocab = set()
        self.basename = 'nostemmer'

    def stem_query(self, q):
        # isword = re.compile('[a-z0-9]+')
        q = utils.clean(q)
        curr_words = self.tokenizer.tokenize(q)
        clean_words = [word.lower() for word in curr_words]
        processed_words = self.process(clean_words)
        self.vocab.update(processed_words)
        return ' '.join(processed_words)

    def stem(self, files):
        # We write files to a -[stemmer].txt file
        filename_mod = files[0].split('.')[0]
        wf = codecs.open('{1}-{0}.txt'.format(self.basename, filename_mod),
                         'w',
                         encoding='utf-8')
        isword = re.compile('[a-z0-9]+')

        # We can work with both gzip and non-gzip
        for fname in files:
            if fname.endswith('gz'):
                f = gzip.open(fname, 'r')
            else:
                f = open(fname)
            for no, line in enumerate(f):
                if isinstance(line, bytes):
                    line = line.decode('utf-8')
                # We drop empty lines
                if len(line.strip()) == 0:
                    continue

                # Clean and process words
                curr_words = self.tokenizer.tokenize(line)
                clean_words = [word.lower() for word in curr_words]
                processed_words = self.process(clean_words)

                # Keep track of vocab size
                self.vocab.update(processed_words)

                # We output according to the one-doc-per-line format for Mallet
                current_line = u' '.join(processed_words)
                line_fmt = '{0}\n'.format(current_line)
                wf.write(line_fmt)
            f.close()

        print('Resulting vocab size: {0}'.format(len(self.vocab)))
        wf.close()

    def process(self, words):
        raise NotImplementedError("No stemmer here!")
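
A minimal sketch of a concrete subclass, assuming only the AbstractStemmer above and NLTK's PorterStemmer; the class name and basename value are illustrative, not from the original:

from nltk.stem import PorterStemmer

class PorterStemmerWrapper(AbstractStemmer):
    def __init__(self):
        super(PorterStemmerWrapper, self).__init__()
        self.stemmer = PorterStemmer()
        self.basename = 'porter'

    def process(self, words):
        # words arrive already lowercased from stem()/stem_query()
        return [self.stemmer.stem(w) for w in words]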
Example No. 6
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Example No. 7
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    tokenizer = WordPunctTokenizer()
    with open(input_file_name) as input_file:
        for record in json.loads(input_file.read()):
            dictionary.update(tokenizer.tokenize(record['content']))
            dictionary.update(tokenizer.tokenize(record['abstract']))

    dictionary = list(sorted(
        w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Example No. 8
 def w2v_training_sents(self, dataList, trainID):
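     # Collect whitespace-joined token strings for each training sentence and for the
     # sentences in its 'pre' and 'lat' context lists, as input lines for word2vec training.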
     word_punct_tokenizer = WordPunctTokenizer()
     x = []
     for currentId in trainID:
         currentData = dataList[currentId]
         currentSent = currentData[2]
         currentPreList = currentData[3]
         currentLatList = currentData[4]
         x.append(' '.join(word_punct_tokenizer.tokenize(currentSent)))
         for item in currentPreList:
             x.append(' '.join(word_punct_tokenizer.tokenize(item)))
         for item in currentLatList:
             x.append(' '.join(word_punct_tokenizer.tokenize(item)))
     return x
Example No. 9
def data_cleaner(text):

    tokenizer = WordPunctTokenizer()

    pat_1 = r"(?:\@|https?\://)\S+"
    pat_2 = r'#\w+ ?'
    combined_pat = r'|'.join((pat_1, pat_2))
    www_pat = r'www.[^ ]+'
    html_tag = r'<[^>]+>'
    negations_ = {
        "isn't": "is not",
        "can't": "can not",
        "couldn't": "could not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "aren't": "are not",
        "haven't": "have not",
        "doesn't": "does not",
        "didn't": "did not",
        "don't": "do not",
        "shouldn't": "should not",
        "wasn't": "was not",
        "weren't": "were not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    negation_pattern = re.compile(r'\b(' + '|'.join(negations_.keys()) +
                                  r')\b')

    try:
        stripped = re.sub(combined_pat, '', text)
        stripped = re.sub(www_pat, '', stripped)
        cleantags = re.sub(html_tag, '', stripped)
        lower_case = cleantags.lower()
        neg_handled = negation_pattern.sub(lambda x: negations_[x.group()],
                                           lower_case)

        if remove_punctuation:
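            # 'remove_punctuation' is assumed to be a flag defined at module level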
            letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
            tokens = tokenizer.tokenize(letters_only)
        else:
            tokens = tokenizer.tokenize(neg_handled)

        return (" ".join(tokens)).strip()
    except:
        return 'NC'
Example No. 10
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove messages numbers
>">
    message_text = re.sub(r">>\d+", "", message)
    message_text = message_text.lower()
    message_text = re.sub(u"ё", 'e', message_text, flags=re.UNICODE)
    message_text = clean_str(message_text)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if len(word) > 0:
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return(lemmas)
Example No. 11
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
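        # ASCII-fold, lowercase, squeeze runs of repeated characters, and collapse
        # URLs/e-mail addresses into the single token GENERIC_HTTP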
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',word) is not None:
            word = 'GENERIC_HTTP'

        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

            data.append([cleaned_sentences, label])

    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
Example No. 12
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.lower()
        word = word.replace('&amp;','&').replace('&lt;','<').replace('&gt;','>').replace('&quot;','"').replace('&#39;',"'")
        word = re.sub(r'(\S)\1+', r'\1\1', word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        word = word.encode('ascii', 'ignore')

        if re.search(r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',word) is not None:
            word = 'GENERIC_HTTP'

        return word.encode('ascii', 'ignore')

    tokenizer = WordPunctTokenizer()

    with gzip.open(input_file_name) as input_file:
        with gzip.open(output_file_name, 'w') as output_file:
            for line in input_file:
                sentences, score = json.loads(line)
                cleaned_sentences = []
                for sentence in sentences:
                    cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                    cleaned_sentences.append(tokenizer.tokenize(cleaned_sentence))

                json.dump([cleaned_sentences, score], output_file)
                output_file.write("\n")
Example No. 13
    def number_of_different_words(self):
        # TODO: Stemming, then move to language specific classes
        tokenizer = WordPunctTokenizer()
        words = tokenizer.tokenize(self.text.strip())
        only_textual_words = filter(unicode.isalpha, words)

        return len(set(only_textual_words))
Example No. 14
def test_name():
    filename = "name.txt"
    name_file = 'tests/test_files/' + filename
    output_dir = 'tests/test_files/redacted/'

    main.init_stats(name_file, 0, None)

    # Get test file
    content = main.get_file_contents(name_file)

    # Used to split the file for POS analysis
    word_punct_tokenizer = WordPunctTokenizer()
    tagged_content = nltk.pos_tag(word_punct_tokenizer.tokenize(content))

    # Redact
    content = main.redact_names(content, tagged_content, name_file)

    # Expected number of named words in the file
    assert (main.num_names[name_file] == 22)

    # Create path
    if (not os.path.isdir(output_dir)):
        sys.stderr.write("Output directory did not exist...creating " +
                         output_dir + "/\n")
        os.makedirs(output_dir)

    # Write out the redacted test file for reference
    main.write_redacted(content, name_file, output_dir)
Example No. 15
def clean_data(input_file_name, output_file_name):
    def clean_word(word):
        word = word.encode('ascii', 'ignore')
        word = word.lower()
        word = re.sub(r'(\S)\1+', r'\1\1',
                      word)  # normalize repeated characters to two
        word = re.sub(r'(\S\S)\1+', r'\1\1', word)

        if re.search(
                r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w-]*)?\??(?:[-\+=&;%@.\w]*)#?(?:[\w]*))?)',
                word) is not None:
            word = 'GENERIC_HTTP'

        return word

    tokenizer = WordPunctTokenizer()
    data = []
    with open(input_file_name) as input_file:
        for sentences, label in json.load(input_file):
            cleaned_sentences = []
            for sentence in sentences:
                cleaned_sentence = " ".join(map(clean_word, sentence.split()))
                cleaned_sentence = tokenizer.tokenize(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

            data.append([cleaned_sentences, label])

    with codecs.open(output_file_name, 'w', encoding='utf-8') as output_file:
        json.dump(data, output_file)
Example No. 16
def message_to_wordlist(message, lemmas_bool, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    #review_text = BeautifulSoup(review).get_text()
    #
    # 2. Remove messages numbers
>">
    message_text = re.sub(r">>\d+", "", message)
    message_text = message_text.lower()
    message_text = re.sub(u"ё", 'e', message_text, flags=re.UNICODE)
    tokenizer = WordPunctTokenizer()
    # 3. Convert words to lower case and split them
    words = tokenizer.tokenize(message_text)
    lemmas = []
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("russian"))
        words = [w for w in words if not w in stops]
    if lemmas_bool == 'l':
        for word in words:
            word_parsed = morph.parse(word)
            if len(word_parsed) > 0:
                lemmas.append(word_parsed[0].normal_form)
    elif lemmas_bool == 's':
        for word in words:
            word = stemmer.stem(word)
            if word and word.isalpha():
                lemmas.append(word)
    else:
        lemmas = words
    # 5. Return a list of words
    return(lemmas)
Example No. 17
def tokenize_words(sentence):
    """
    :param sentence:
    :return: list of words in sentence
    """
    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
Example No. 18
def test_concept():
    filename = "concept.txt"
    concept_file = 'tests/test_files/' + filename
    output_dir = 'tests/test_files/redacted/'

    main.init_stats(concept_file, 0, None)

    # Get test file
    content = main.get_file_contents(concept_file)

    # Used to split the file for POS analysis
    word_punct_tokenizer = WordPunctTokenizer()
    tagged_content = nltk.pos_tag(word_punct_tokenizer.tokenize(content))

    # Make required dot structure.
    # See https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
    arg = {"concept": ["child"]}
    args = temp(arg)

    # Redact
    content = main.redact_concept(content, concept_file, args)

    # Expected number of concept words in the file
    assert (main.num_concept[concept_file] == 12)

    # Create path
    if (not os.path.isdir(output_dir)):
        sys.stderr.write("Output directory did not exist...creating " +
                         output_dir + "/\n")
        os.makedirs(output_dir)

    # Write out the redacted test file for reference
    main.write_redacted(content, concept_file, output_dir)
Example No. 19
 def _tokenize(self, text):
     tk = WordPunctTokenizer()
     result = tk.tokenize(text)
     if DEBUG:
         print("Result after tokenizing: "),
         print(result)
     return result
Example No. 20
def to_index(vocab, texts, add_os=True):
    words_indices = []

    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()

    # maxlen = 0

    for text in texts:
        words = tokenizer.tokenize(text)
        lemmas = [lemmatizer.lemmatize(w) for w in words]
        # maxlen = max(maxlen, len(lemmas))
        words_index = []
        if add_os is True:
            words_index.append(SOS_ID)  # sentence start
        for lemma in lemmas:
            if lemma in vocab:
                words_index.append(vocab[lemma])
            else:
                words_index.append(UNK_ID)  # <unk>
        if add_os is True:
            words_index.append(EOS_ID)  # sentence end
        words_indices.append(words_index)

    # print maxlen

    return words_indices
Example No. 21
def get_vocab(data_list, vocab_size=None):
    # Process passages and count word frequencies
    tokenizer = WordPunctTokenizer()
    lemmatizer = WordNetLemmatizer()
    all_words = []
    for record in data_list:
        for passage in record['passages']:  # look at every passage, not only the selected ones
            passage_words = tokenizer.tokenize(
                passage['passage_text'])  # tokenize; whether to lowercase everything is left for later
            passage_lemma_words = [
                lemmatizer.lemmatize(w) for w in passage_words
            ]  # lemmatization
            all_words.extend(passage_lemma_words)
    vocab_dict = collections.Counter(all_words)
    # Keep the vocab_size most frequent words; no explicit <unk> entry is added, since words
    # missing from the vocab simply map to UNK at index time
    # (that holds when embeddings are trained jointly; with pretrained embeddings,
    # out-of-vocabulary words get a zero vector instead)
    if vocab_size is not None:
        vocab_list = vocab_dict.most_common(vocab_size)  # already sorted in descending order
    else:
        vocab_list = vocab_dict.most_common()  # all entries
    print len(vocab_list)
    # '<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3 are not stored here; those ids are
    # used directly at index time
    vocab = {}
    i = EOS_ID + 1
    for v in vocab_list:
        vocab[v[0]] = i  # keep only the word and its index
        i += 1
    # print vocab  # many frequent words here; consider removing stopwords and punctuation
    return vocab
Example No. 22
def clean_tweets(tweet):
    """
    Función para limpiar los tweets antes de ser enviados a la API de análisis de
    sentimiento.

    Nota:   La API de Google es bastante flexible a la hora de realizar análisis de
            sentimiento. No estoy seguro de que todas estas "limpiezas" sean del todo
            necesarias.

    Args:
        tweet: Tweet (o texto) a limpiar.

    Returns:
        clean_tweet: Tweet ya limpio para proceder a realizar análisis de sentimiento.
    """

    # Remove the user mention from the tweet
    user_removed = re.sub(r'@[A-Za-z0-9]+', '', tweet.decode('utf-8'))

    # Remove any link present in the tweet
    link_removed = re.sub('https?://[A-Za-z0-9./]+', '', user_removed)

    # Lowercase everything
    lower_case_tweet = link_removed.lower()

    # Instantiate a tokenizer and, according to its rules, build the list of tokens
    tok = WordPunctTokenizer()
    words = tok.tokenize(lower_case_tweet)

    # Join the tokens back into a single string to be sent
    clean_tweet = (' '.join(words)).strip()

    return clean_tweet
Example No. 23
def get_feeds(name):
    tweets_list = []
    if consumer_key == '':
        f = open("example.txt")
        ttt = f.readlines()
        for t in ttt:
            tweets_list.append(t)
        return tweets_list
    else:
        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_secret)
        api = tweepy.API(auth)
        new_tweet = api.user_timeline(screen_name=name, count=50)
        tweets_list.extend(new_tweet)
        # Processing the tweets
        cleaned_tweets_list = []  # All cleaned tweets are stored in this list
        for status in tweets_list:
            tweet_i = status.text.encode('utf-8')
            removed = re.sub(r'@[A-Za-z0-9]+', '', tweet_i.decode('utf-8'))
            link_rm = re.sub('https?://[A-Za-z0-9./]+', '', removed)
            number_rm = re.sub('[^a-zA-Z]', ' ', link_rm)
            lower = number_rm.lower()
            tok = WordPunctTokenizer()
            words = tok.tokenize(lower)
            cleaned = (' '.join(words)).strip()
            cleaned_tweets_list.append(cleaned)
        return cleaned_tweets_list
Example No. 24
    def normalize(cls, input_doc, language="english"):
        '''
        Normalize given input.
        '''

        # Remove special-chars
        if language == "german":
            processed_doc = re.sub(cls.NON_ALPHA_GER, '', input_doc)
        else:
            processed_doc = re.sub(cls.NON_ALPHA_GER, '', input_doc)
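            # NOTE: both branches currently apply the German pattern; the English
            # branch presumably intends a separate cls.NON_ALPHA pattern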
        # To Lowercase and Strip Whitespaces
        processed_doc = processed_doc.lower().strip()

        # Tokenize
        tokenizer = WordPunctTokenizer()
        tokens = tokenizer.tokenize(processed_doc)

        # Remove Stopwords.
        if language == "german":
            stop = stopwords.words("german")
            cleaned_tokens = [token for token in tokens if token not in stop]
        else:
            stop = stopwords.words("english")
            cleaned_tokens = [token for token in tokens if token not in stop]

        processed_doc = ' '.join(cleaned_tokens)

        return processed_doc
Example No. 25
def clean_text(text):
    """
    A function to pre-process text

    Parameters
    ----------
    text : string
        the string to be processed
    Returns
    -------
    text : string
        a clean string
    """
    tok = WordPunctTokenizer()
    pat1 = r'@[A-Za-z0-9]+'
    pat2 = r'https?://[A-Za-z0-9./]+'
    combined_pat = r'|'.join((pat1, pat2))
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()
Example No. 26
 def compute_word_context_matrix(self, window=5):
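     # Build a |V| x |V| co-occurrence matrix: for every in-vocabulary term, count the
     # in-vocabulary terms appearing within `window` tokens on either side of it.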
     tokenizer = WordPunctTokenizer()
     self.word_context_matrix = np.zeros(
         (len(self.corpus.vocabulary), len(self.corpus.vocabulary)))
     for doc_id in range(self.corpus.size):
         print doc_id
         document = self.corpus.full_content(doc_id)
         terms = tokenizer.tokenize(document)
         nb_terms = len(terms)
         for i in range(nb_terms):
             row_index = self.corpus.id_for_word(terms[i])
             if row_index != -1:
                 start = i - window
                 if start < 0:
                     start = 0
                 end = i + window
                 if end >= nb_terms:
                     end = nb_terms - 1
                 context0 = terms[start:i]
                 context1 = terms[i + 1:end + 1]
                 context0.extend(context1)
                 for term in context0:
                     column_index = self.corpus.id_for_word(term)
                     if column_index != -1:
                         self.word_context_matrix[row_index][
                             column_index] += 1
Example No. 27
    def tag(self, sent):
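        # POS-tag the sentence while keeping each time expression found by find_time()
        # as a single token that carries the tag find_time() assigned to it.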
        times = self.find_time(sent)
        intervals = dict([(time[0], time[1]) for time in times])
        tag_dict = dict([(time[2], time[3]) for time in times])
        tokenizer = WordPunctTokenizer()
        # for a in [time[2] for time in times]:
        #     tokenizer.add_mwe(a.split())

        # --- FIXED ---
        original_tokens = tokenizer.tokenize(sent)
        original_tags = pos_tag(original_tokens)
        # --- END FIXED ---

        tokens = []
        current = 0
        for span in tokenizer.span_tokenize(sent):
            if span[0] < current:
                continue
            if span[0] in intervals:
                tokens.append(f'__{sent[span[0]: intervals[span[0]]]}')
                current = intervals[span[0]]
            else:
                tokens.append(sent[span[0]:span[1]])
                current = span[1]

        tags = pos_tag(tokens)

        new_tags = []
        for word, tag in tags:
            if word[:2] == '__':
                new_tags.append((word[2:], tag_dict[word[2:]]))
            else:
                tag = [t[1] for t in original_tags if t[0] == word][0]  # FIXED
                new_tags.append((word, tag))
        return new_tags
Example No. 28
 def transferDataw2v(self, allLabeledList, trainID, alpha=0.1):
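     # Build training data for the labelled ids: word2vec vectors per sentence, FOFE
     # encodings of the surrounding context, binarized labels, and per-label counts.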
     word_punct_tokenizer = WordPunctTokenizer()
     total = [0] * len(self.avilableLabels)
     xtrain = []
     pretrain = []
     lattrain = []
     ytrain = []
     for currentId in trainID:
         currentData = allLabeledList[currentId]
         currentLabel = currentData[1]
         currentSent = currentData[2]
         currentPreList = currentData[3]
         currentLatList = currentData[4]
         if currentLabel in self.avilableLabels:
             idx = self.avilableLabels.index(currentLabel)
             total[idx] += 1
             binLabel = label_binarize([currentLabel],
                                       self.avilableLabels).tolist()[0]
             w2vList, fofeCode = self.sentW2v(
                 word_punct_tokenizer.tokenize(currentSent), self.ebd_size)
             xtrain.append(w2vList)
             prefofe = self.w2vEncoding(currentPreList,
                                        word_punct_tokenizer,
                                        sentLevelAlpha=alpha)
             latfofe = self.w2vEncoding(currentLatList,
                                        word_punct_tokenizer,
                                        reverse=True,
                                        sentLevelAlpha=alpha)
             pretrain.append(prefofe)
             lattrain.append(latfofe)
             ytrain.append(binLabel)
     return xtrain, pretrain, lattrain, ytrain, total
Example No. 29
def get_matrix_of_concatenated_document_embeddings(embeddings, n_dim, texts, token_limit=20, stop_words=[''], scale=False):
    """

    :param embeddings: dict mapping token -> embedding vector
    :param n_dim: dimensionality of each embedding vector
    :param texts: iterable of raw text strings
    :param token_limit: number of leading tokens per text to concatenate
    :param stop_words: tokens to skip
    :param scale: whether to scale each embedding with MaxAbsScaler
    :return: matrix of shape (len(texts), token_limit * n_dim)
    """

    scaler = preprocessing.MaxAbsScaler()
    # scaler = preprocessing.MinMaxScaler()
    tokenizer = WordPunctTokenizer()

    matrix = np.zeros((len(texts), token_limit*n_dim))
    for i_texts in range(0, len(texts)):
        tokens = tokenizer.tokenize(texts[i_texts])
        tmp = []
        for i_token in range(0, token_limit):
            cur_embedding = [0] * n_dim
            # if text still has tokens left, the current token is in the embeddings, and it is not on the stop word list
            if i_token < len(tokens) and tokens[i_token] in embeddings.keys() and not tokens[i_token] in stop_words:
                tmp_embedding = scaler.fit_transform(embeddings[tokens[i_token]]) if scale else embeddings[tokens[i_token]]
                cur_embedding = tmp_embedding.tolist()
            tmp += cur_embedding

        matrix[i_texts] = np.array(tmp)

    return matrix
Example No. 30
def lemmatize(text):
    word_punct_tokenizer = WordPunctTokenizer()
    tokens = word_punct_tokenizer.tokenize(text)
    lem = WordNetLemmatizer()
    ps = PorterStemmer()
    return [lem.lemmatize(w.lower()) for w in tokens
            ]  # [lem.lemmatize(ps.stem(w.lower())) for w in tokens]
Example No. 31
class w2vModel:
    def __init__(self):
        self.tokenizer = WordPunctTokenizer()
        # load the pretrained word2vec model
        self.word2VecModel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin',binary=True)
    # compute the distance between two short texts
    def getWordDistance(self,word1,word2):
        if word1=='' or word2=='':
            return 0
        A = self.tokenizer.tokenize(word1)  # tokenize
        B = self.tokenizer.tokenize(word2)
        scores=[]
        for w1 in A:
            ss=[]
            for w2 in B:
                try:
                    ss.append(self.word2VecModel.similarity(w1,w2))
                except:
                    if w1==w2:
                        ss.append(1)
                    else:
                        ss.append(0)
            scores.append(ss)
        La = 0
        Lb = 0
        for i in range(len(A)):
            La += max(scores[i])
        La /= len(A)
        for i in range(len(B)):
            maxnum=0
            for j in range(len(A)):
                maxnum = scores[j][i] if scores[j][i]>maxnum else maxnum
            Lb += maxnum
        Lb /= len(B)
        return (La+Lb)/2
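
A minimal usage sketch, assuming GoogleNews-vectors-negative300.bin is available in the working directory (the example phrases are illustrative):

model = w2vModel()
print(model.getWordDistance('machine learning', 'statistical learning'))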
Example No. 32
class CocoDataset(Dataset):
    def __init__(self, image_dir, caption_dir, n_samples=5000, transform=None):
        self.image_dir = image_dir
        self.caption_dir = caption_dir
        self.transform = transform
        self.file_names = os.listdir(self.image_dir)[:n_samples]
        self.word_tokenizor = WordPunctTokenizer()
        self.id_to_captions = {}
        for id_caption in open(caption_dir,
                               encoding='utf-8').read().strip().split('\n'):
            if len(id_caption.split('\t')) == 2:
                id, caption = id_caption.split('\t')
                self.id_to_captions[id] = caption.lower()

    def __len__(self):
        return len(self.file_names)

    def __getitem__(self, idx):
        image = Image.open(os.path.join(self.image_dir, self.file_names[idx]))
        image = image.convert('RGB')
        caption = [
            word_to_id[word]
            if word in word_to_id.keys() else word_to_id['UNK']
            for word in self.word_tokenizor.tokenize(self.id_to_captions[
                self.file_names[idx]])
        ]
        caption = torch.Tensor(caption + [word_to_id['<EOS>']],
                               device=device).view(-1, 1)
        caption = caption.long()

        if self.transform:
            image_new = self.transform(image)
        sample = {'image': image_new, 'caption': caption}
        return sample
Example No. 33
class fred_language_analyser(language_analyser):
	''' A custom analyser based on nltk, using a naive brute-force algorithm.
	'''
	def __init__(self, language = 'french'):
		'''Initialisation
			language	:	'french'
		'''
		self.tokenizer = WordPunctTokenizer()		
		self.stopwords = set(stopwords.words(language))
		self.stopwords.add(u"'")
	
	def text_to_vector(self, text):
		 tokens = self.tokenizer.tokenize(text)
		 tokens = [token for token in tokens if token.lower() not in self.stopwords]
		 return tokens
	
	def distance(self, text1, text2):
		v1 = self.text_to_vector(text1)
		v2 = self.text_to_vector(text2)
		# Pending optimisation, limit to the first 6 words
		v1 = v1[0:6]
		v2 = v2[0:6]
		n = max(len(v1),len(v2))
		if len(v1)>len(v2):
			v1,v2 = v2,v1
		v1_1 = v1 + [None]*(n-len(v1))
		distance = 99
		for v1_2 in itertools.permutations(v1_1):  # a bit crude: the Nones are also permuted with each other
			# Distance between the words
			d_mot=0
			for i in range(n):
				try:
					d_mot += (6-min(6,edit_distance(v1_2[i],v2[i])))**2
				except:
					d_mot += 1  # if None
			d_mot = 6*(n**0.5)-d_mot**0.5
			# distance of the permutation
			# number of inserted Nones = Nones that are neither at the start nor at the end
			v1_3 = []
			debut = True
			for m in v1_2:
				if m or not debut:
					debut = False
					v1_3.append(m)
			v1_4 = []
			debut = True
			for i in range(len(v1_3)-1,-1,-1):
				if v1_3[i] or not debut:
					debut = False
					v1_4.append(v1_3[i])
			d_perm = len(v1_4)-len(v1)
			# word permutations: 3 per permutation
			l=[]
			for m in list(filter(lambda x:x,v1_4)):
				l.append(v1.index(m))
			for i in range(len(l)-1):
				if l[i]<l[i+1]:
					d_perm +=3
			distance = min(distance, (d_mot**2+d_perm**2)**0.5)
		return distance
Example No. 34
    def words(self, fileid=None):
        """
        Returns all of the words and punctuation symbols in the specified file
        that were in text nodes -- i.e., tags are ignored. Like the xml() method,
        fileid can only specify one file.

        :return: the given file's text nodes as a list of words and punctuation symbols
        :rtype: list(str)
        """

        elt = self.xml(fileid)
        encoding = self.encoding(fileid)
        word_tokenizer = WordPunctTokenizer()
        try:
            iterator = elt.getiterator()
        except:
            iterator = elt.iter()
        out = []

        for node in iterator:
            text = node.text
            if text is not None:
                if isinstance(text, bytes):
                    text = text.decode(encoding)
                toks = word_tokenizer.tokenize(text)
                out.extend(toks)
        return out
Example No. 35
def TextProcessor(src, tgt, low=True, num=True):

    print "processing "+src
    if low==True:
        print "lowercasing.."
    if num==True:
        print "removing numeric.."

    srcfile = codecs.open(src,"r","utf-8")
    tgtfile = codecs.open(tgt,"w","utf-8")

    word_punct_tokenizer = WordPunctTokenizer()

    linecount=0
    for line in srcfile:
        linecount+=1
        line = word_punct_tokenizer.tokenize(line)
        if low==True:
            for i in range(0,len(line)):
                line[i] = line[i].lower()
        if num==True:
            for i in range(0,len(line)):
                if line[i].isnumeric()==True:
                    line[i] = "<number>"

        tgtfile.write(listtostring(line))

    srcfile.close()
    tgtfile.close()
    print "done processing "+str(linecount)+" lines!!"
Example No. 36
class SentencesIterator(object):
    def __init__(self, dirname):
        self.dirname = dirname
        self.tokenizer = WordPunctTokenizer()

    def __iter__(self):
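        # Stream one tokenized sentence at a time (gensim-style corpus iterator);
        # 'Trump' mentions are first disambiguated by campaign/election period.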
        for fname in os.listdir(self.dirname):
            date = get_date(fname)
            with open(os.path.join(self.dirname, fname), 'r') as f:
                text = f.read()
            text = text.replace('Donald Trump', 'Donald_Trump')
            text = text.replace('Melania Trump', 'Melania_Trump')
            text = text.replace('Ivanka Trump', 'Ivanka_Trump')
            text = text.replace('Eric Trump', 'Eric_Trump')
            if date and date < ANNOUNCEMENT_DATE:
                text = text.replace('Trump', 'Trump_Pre_Campaign')
            elif date and date < ELECTION_DATE:
                text = text.replace('Trump', 'Trump_Pre_Election')
            elif date and date >= ELECTION_DATE:
                text = text.replace('Trump', 'Trump_Post_Election')

            text = text.replace("\xa0", " ").replace('“',
                                                     '"').replace('”', '"')
            sents = sent_tokenize(text)
            for sent in sents:
                yield self.tokenizer.tokenize(sent)
Example No. 37
def extract_nl_text(ms):
    """
    Extracts and tokenizes text from malware sample object

    :param ms: MalwareSample object
    :return: list of tokenized strings found in malware sample object's internal strings list
    """
    wpt = WordPunctTokenizer()
    all_tokenized_strings_in_ms = []
    inside_xml_privileges = False
    for s in ms.strings:
        if 'requestedPrivileges' in s or 'This program cannot be run in DOS mode' in s:
            continue
        elif inside_xml_privileges:
            continue
        elif '<assembly xmlns' in s:
            inside_xml_privileges = True
            continue
        elif '</assembly>' in s:
            inside_xml_privileges = False
            continue

        tokenized_string = []
        tokens = wpt.tokenize(s)
        if tokens:
            for t in tokens:
                if wordnet.synsets(t) and len(t) > 3:  # had to use length to eliminate false positives
                    tokenized_string.extend(tokens)
                    break
        if tokenized_string:
            all_tokenized_strings_in_ms.append(tokenized_string)
    return all_tokenized_strings_in_ms
Example No. 38
def main_wrapper():
    model, enc, device = init_model(42, "gpt2-xl")
    messages = []
    questions = np.load('all_questions.npy')[-30000:]
    answers = np.load('gpt_answers.npy').tolist()
    tokenizer = WordPunctTokenizer()
    for i in range(len(questions) // 30 + 1):
        for j, question in enumerate(questions[30 * i:30 * (i + 1)]):
            print("\n")
            #         input_text = input("Enter your message here: ")
            output_text = produce_answer(question,
                                         messages,
                                         30,
                                         10,
                                         1.0,
                                         False,
                                         model,
                                         enc,
                                         device,
                                         insert_intro=True,
                                         wrap_type='QA')
            output_text = " ".join(tokenizer.tokenize(output_text)[:30])
            print(j + 30 * i)
            print(question)
            print(output_text)
            answers.append(output_text)
        np.save('gpt_answers.npy', np.array(answers))
Example No. 39
 def get_words_without_stopwords(self, text):
     stopwords = nltk.corpus.stopwords.words('english')
     stopwords.extend(string.punctuation)
     stopwords.append('')
     tokenizer = WordPunctTokenizer()
     tokens = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(text) \
               if token.lower().strip(string.punctuation) not in stopwords]
     return tokens
Example No. 40
	def words(self, fileid=None):
		"""
		Returns all of the words and punctuation symbols in the specified file
		that were in 'section//p' text nodes.
		"""
		elt = self.xml(fileid).iterfind('.//section//p')
		word_tokenizer = WordPunctTokenizer()
		return [
			token
			for el in elt
			for token in word_tokenizer.tokenize(''.join(el.itertext()))
		]
Example No. 41
def extract_words(text):
    stemmer = PorterStemmer()

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)

    result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Example No. 42
    def get_similarity_score(a, b):
        """Return the Jaccard similarity between the stopword-filtered token sets of a and b."""
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.extend(string.punctuation)
        stopwords.append('')
        tokenizer = WordPunctTokenizer()
        tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a) \
                    if token.lower().strip(string.punctuation) not in stopwords]

        tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b) \
                    if token.lower().strip(string.punctuation) not in stopwords]

        # Calculate Jaccard similarity
        ratio = 0
        if len(set(tokens_a).union(tokens_b)) > 0:
            ratio = len(set(tokens_a).intersection(tokens_b)) / float(len(set(tokens_a).union(tokens_b)))
        return (ratio)
Example No. 43
def get_tokens(sentence):
    """
    Tokenizes a list of sentences
    :param sentence: list of sentences
    :return: list of tokenized sentences
    """

    tokenizer = WordPunctTokenizer()
    return tokenizer.tokenize(sentence)
Example No. 44
File: seo.py  Project: blorenz/cms
def getBigram(haystack):
    tokenizer = WordPunctTokenizer()
    words = tokenizer.tokenize(haystack)
    bcf = BigramCollocationFinder.from_words(words)
    stopset = set(stopwords.words('english'))
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf.apply_word_filter(filter_stops)

    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, 4)
Example No. 45
def tokenize(text): 
	tokens = tokenizer.tokenize(text)
	wordtokenizer = WordPunctTokenizer()
	wlist =[]
	for token in tokens:
		wtoken = wordtokenizer.tokenize(token)
		wlist = wlist+wtoken

	stems = stem_tokens(wlist, stemmer)
	return stems
Example No. 46
def extract_words(text):
 stemmer = PorterStemmer()
 tokenizer = WordPunctTokenizer()
 tokens = tokenizer.tokenize(text)
 bigram_finder = BigramCollocationFinder.from_words(tokens)
 bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 500)
 for bigram_tuple in bigrams:
  x = "%s %s" % bigram_tuple
  tokens.append(x)
 result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
 return result 
Example No. 47
def get_bigrams(text):
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    result = []
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)

    return tokens
Example No. 48
def you_collocations(raw):

    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(raw)

    bigrams = [(tokens[i], tokens[i +1]) for i in range(len(tokens)-1)]
    collocations = [(t1, t2) for (t1, t2) in bigrams if t1 == "you" or t1 == 'your']

    trigrams = [(tokens[i], tokens[i +1], tokens[i+2]) for i in range(len(tokens)-2)]
    trilocations = [(t1, t2, t3) for (t1, t2, t3) in trigrams if t1 == "you" or t1 == 'your']

    return collocations, trilocations
Example No. 49
def extract_bigrams(text):
    text = remove_stopwords(text)
    tokenizer = WordPunctTokenizer()
    tokens = [token for token in set(tokenizer.tokenize(text)) if
              not is_number(token) and (is_valid_token(token) or is_name(token))]
    bigram_finder = BigramCollocationFinder.from_words(tokens)
    bigrams = bigram_finder.nbest(BigramAssocMeasures.dice, 500)
    for bigram_tuple in bigrams:
        x = "%s %s" % bigram_tuple
        tokens.append(x)
    result = [x.lower() for x in tokens if x not in stopwords.words("english") and len(x) > 3]
    return result
Example No. 50
def decisionTreeClassifier():
	import nltk
	from nltk.tokenize import WordPunctTokenizer
	docId = request.args.get('d')
	tokenizer = WordPunctTokenizer()		
	collection = initialize_collection('documents')

	featuresets = []
	tagSet = set()
	for d in collection.find():	
		bagOfWords = bag_of_words(tokenizer.tokenize(d['content']))
		if 'tags' not in d: continue
		for tag in d['tags']:
			featuresets.append((bagOfWords, tag))
			tagSet.add(tag)
	classifier = nltk.DecisionTreeClassifier.train(featuresets)
	print classifier.pseudocode(depth=4)
	d = collection.find_one({'_id' : ObjectId(docId)})
	print classifier.classify(bag_of_words(tokenizer.tokenize(d['content'])))
	
	return 'hello'
	"""
Exemplo n.º 51
0
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with gzip.open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            # dictionary.update(text.split())
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 3)) + ["PADDING", "UNKNOWN"]

    with open(output_file_name, "w") as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Example No. 52
def extract_words(text):
 	
    stemmer = PorterStemmer()
    if type(text) == str:
        text = unicode(text, "utf-8", errors="ignore")
    else:
        text = unicode(text)
     
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
 
    result =  [stemmer.stem(x.lower()) for x in tokens if x not in stopwords.words('english') and len(x) > 1]
    return result
Example No. 53
def analyze(tweets):
    classifier = cache.get('classifier')
    if classifier is None:
        classifier = train_classifier()
        cache.set('classifier', classifier, None)
    tokenizer = WordPunctTokenizer()
    analyzed_tweets = []
    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.lower())
        featureset = word_feats(tokens)
        sentiment = classifier.prob_classify(featureset)
        analyzed_tweets.append(AnalyzedTweet(tweet, round(sentiment.prob('pos'),2), round(sentiment.prob('neg'),2)))
    return analyzed_tweets
Example No. 54
def build_word_dictionary(input_file_name, output_file_name):
    dictionary = Counter()
    with open(input_file_name) as input_file:
        for line in json.loads(input_file.read()):
            text, label = line
            tokenizer = WordPunctTokenizer()
            dictionary.update(tokenizer.tokenize(text))

    dictionary = list(sorted(w for w in dictionary if dictionary[w] >= 5)) + ['PADDING', 'UNKNOWN']
    # dictionary = list(sorted(w for w,c in dictionary.most_common(3000))) + ['PADDING', 'UNKNOWN']

    with open(output_file_name, 'w') as output_file:
        output_file.write("{}\n".format(json.dumps(dictionary)))
Example No. 55
def convert(sgm_path, apf_path, bio_path=None):
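    # Convert an ACE .sgm/.apf document pair to token-per-line BIO format: tokenize and
    # POS-tag the source text, then label entity-mention head spans as B-/I-<TYPE>.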
    xml_parser = etree.XMLParser(recover=True)
    try:
        sgm_tree = etree.parse(sgm_path, xml_parser)
        apf_tree = etree.parse(apf_path, xml_parser)
        if not bio_path:
            bio_path = os.path.commonprefix([sgm_path, apf_path]) + 'bio'
        output = open(bio_path, 'w')
    except:
        print 'Something wrong when opening/parsing xml file, or opening output file'
        return
    
    init_offset = get_init_offset(sgm_path)
    text = sgm_tree.xpath('/DOC/BODY/TEXT')[0].text.strip('\n')
    
    tokenizer = WordPunctTokenizer()
    tokens = tokenizer.tokenize(text)
    spans = list(tokenizer.span_tokenize(text))
    pos = pos_tag(tokens)
    
    ts = []
    for i in range(len(tokens)):
        t = token()
        t.text = tokens[i]
        t.pos = pos[i][1]
        t.span = (spans[i][0] + init_offset, spans[i][1] - 1 + init_offset)
        t.bio = 'O'
        ts.append(t)
        
    entits = apf_tree.xpath('/source_file/document/entity')
    for enty in entits:
        enty_type = enty.get('TYPE')
        mentions = enty.xpath('entity_mention')
        for m in mentions:
            head = m.xpath('head')[0]
            span = (int(head[0].get('START')), int(head[0].get('END')))
            found = False
            for t in ts:
                if t.span[0] == span[0]:
                    t.bio = 'B-' + enty_type
                    found = True
                if t.span[0] > span[0] and t.span[1] <= span[1]:
                    t.bio = 'I-' + enty_type
                    found = True
            if not found:
                print 'entity mention head span not found', span, apf_path
    
    for t in ts:
        #print t.text, t.span
        output.write('\t'.join([t.text, t.pos, t.bio]) + '\n')
    output.close()
Example No. 56
def OnButtonClick ():
        file = tkFileDialog.askopenfile(parent=root,mode='rb',title='Select a file')
        if file != None:
            print "Initializing... Please Wait"
            ini_db()
            
            file_list=file.readlines()

            for line in file_list:
                
                line=line.strip()
                fp1=open(line,"r")
                document_count()
                text=fp1.read()    
                #dictonary to store word frequency in text(temporary)
                doc_word_freq={}
                #Tokenize 
                from nltk.tokenize import WordPunctTokenizer
                tokenizer = WordPunctTokenizer()
                text2=tokenizer.tokenize(text)
            



                #removing stopwords
                from nltk.corpus import stopwords
                eng_stop=set(stopwords.words('english'))
                text3=[word for word in text2 if word not in eng_stop]

                #pos tag
                import nltk
                text4=nltk.pos_tag(text3)
                text5=filter_for_tags(text4)


                #calculate frequency of word in the text
                for word in text5:
                    if word in doc_word_freq:
                        doc_word_freq[word] += 1
                    else:
                        if(word != "'"):
                            doc_word_freq[word] = 1

                #update occurance of word in global table
                for (word,freq) in doc_word_freq.items():
                    if (check(word)):
                        update_record(word)
                    else:
                        add_new_word(word)
            print "Initialization Done...\n\n"
            file.close()
Example No. 57
 def word_tokenizePT(self,  text, tokenizer):
     """ tokenize a portuguese sentence in words
     @input params: sentence - a sentence, a phrase (self)
                    tokenizer - "TB" for TreebankWordTokenizer
                                "WP" for WordPunctTokenizer
     @returns word's list or error """
     if tokenizer == "TB":
         tokenizerTB = TreebankWordTokenizer()
         return tokenizerTB.tokenize(text)
     elif tokenizer == "WP":
         tokenizerWP = WordPunctTokenizer()
         return tokenizerWP.tokenize(text)
     else:
         return "tokenizer error: not found"