Example #1
	def word_vec_matrix(self, model, one_hot):
		training_data = {"article":[],"summaries":[]}
		i=1
		for k in range(len(self.data["articles"])):
			art=[]
			summ=[]
			for word in wt(self.data["articles"][k].lower()):
				try:
				    art.append(model.wv.word_vec(word))
				except Exception as e:
				    print(e)

			for word in wt(self.data["summaries"][k].lower()):
				try:
					summ.append(one_hot[word])
					#summ.append(model.wv.word_vec(word))
				except Exception as e:
					print(e)

			training_data["article"].append(art) 
			training_data["summaries"].append(summ)
			if i%100==0:
				logger.info("progress: " + str(((i*100)/len(self.data["articles"]))))
			i+=1
		
		print('\007')
		return training_data
Example #2
def dict_build():
    print("------------Building Dictionary------------")
    #Build dict from the uOttawa and Reuters JSON corpora
    #   *Full text, altered text, stemmed, stopwords removed, normalized
    #Access corpus
    uottawa_json = os.path.dirname(os.path.join(
        os.getcwd())) + "/Json_data/uottawa_corpus.json"
    reuters_json = os.path.dirname(os.path.join(
        os.getcwd())) + "/Json_data/final_reuters_corpus.json"
    corpus_collection = [
        uottawa_json, reuters_json
    ]  #corpus_collection = ["reuters_corpus.json", "uottawa_corpus.json"]
    #Dictionary Json Structure
    dict = {
        'fullText': set(),
        'alteredText': set(),
        'stemmedText': set(),
        'stopWord': set(),
        'normalized': set()
    }
    #Enumerate allows us to loop over something and have an automatic counter.
    #docID -> article number starting with 1
    #Find title body topic create snippet
    for id, corpus_data in enumerate(corpus_collection):
        with open(corpus_data) as corpus:
            data = json.load(corpus)

            for values in data:
                tokenized_title = [
                    word.lower() for word in wt(values['title'])
                    if word not in string.punctuation and not any(
                        i.isdigit() for i in word) and word != ""
                ]
                tokenized_description = [
                    word.lower() for word in wt(values['description'])
                    if word not in string.punctuation and not any(
                        i.isdigit() for i in word) and word != ""
                ]
                #dict fulltext
                dict['fullText'] |= set(tokenized_title)
                dict['fullText'] |= set(tokenized_description)
                #dict altered text [normalizing stemmed and stopwords removed title and description]
                dict['alteredText'] |= normalize(
                    stemming(remove_stopwords(tokenized_title)))
                dict['alteredText'] |= normalize(
                    stemming(remove_stopwords(tokenized_description)))
                #stopwords removal
                dict['stopWord'] |= remove_stopwords(tokenized_title)
                dict['stopWord'] |= remove_stopwords(tokenized_description)
                #stemming [porter stemmer]
                dict['stemmedText'] |= stemming(tokenized_title)
                dict['stemmedText'] |= stemming(tokenized_description)
                #normalization
                dict['normalized'] |= normalize(tokenized_title)
                dict['normalized'] |= normalize(tokenized_description)

        with open('dictionary.json', 'w') as outfile:
            my_dict_lists = {k: list(v) for (k, v) in dict.items()}
            json.dump(my_dict_lists, outfile, ensure_ascii=False, indent=4)
    print("------------Done------------")
Example #3
def preprocess_snli_jsonl(file_path, vocab_idx, out_file, vocab_size=30000):
    X1 = []
    X2 = []
    l1 = []
    l2 = []
    Y = []
    labels = {'neutral': 0, 'entailment': 1, 'contradiction': 2}
    with codecs.open(file_path, 'r', 'utf-8') as f:
        for line in f:
            line = json.loads(line)
            if line['gold_label'] not in labels:
                continue
            sentence1 = [w.lower() for w in wt(line['sentence1'])]
            s1 = []
            for w in sentence1:
                s1.append(vocab_idx[w] if w in vocab_idx else vocab_size - 1)
            sentence2 = [w.lower() for w in wt(line['sentence2'])]
            s2 = []
            for w in sentence2:
                s2.append(vocab_idx[w] if w in vocab_idx else vocab_size - 1)
            
            X1.append(np.array(s1))
            X2.append(np.array(s2))
            l1.append(len(s1))
            l2.append(len(s2))
            Y.append(labels[line['gold_label']])
    
    writer = codecs.open(out_file, 'wb')
    data = {'X1': np.array(X1), 'X2': np.array(X2), 'l1': np.array(l1), 'l2': np.array(l2), 'Y': np.array(Y)}
    pickle.dump(data, writer)
    writer.close()
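preprocess_snli_jsonl() expects vocab_idx, a word-to-index map in which the last id (vocab_size - 1) is reserved for out-of-vocabulary words. One plausible way to build it from the pickled (word, count) list written by build_vocab() in Example #20 is sketched below; load_vocab_idx is an illustrative name, not part of the original project.

import pickle

def load_vocab_idx(vocab_file, vocab_size=30000):
    # vocab is a list of (word, count) pairs sorted by descending count
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)
    # keep the top vocab_size - 1 words; the last index is left for OOV tokens
    return {w: i for i, (w, _) in enumerate(vocab[:vocab_size - 1])}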
Example #4
def gen_signature(word):
    """Generate a signature for each candidate expansion, using contextual
       information from the Brown corpus, as well as WordNet definitions and
       examples (if applicable)."""
    if word in gen_signature.dict:
        return gen_signature.dict[word]
    inds = find_matches(word)
    if len(inds) > 50:
        f = len(inds) / 50
        inds = [inds[int(i * f)] for i in range(50)]
    signature = defaultdict(int)
    for i in inds:
        for w in gen_context(i, brown):
            signature[w] += 1
    sig = {w for w in signature
           if signature[w] > 1
           and w not in stopwords.words('english') and w != ','}
    if word in wn.words():
        if wn.synsets(word) and str(wn.synsets(word)[0]).count("'") == 2:
            define = (eval("wn.{}.definition()".format(
                      str(wn.synsets(word)[0]).lower())))
            examples = (eval("wn.{}.examples()".format(
                        str(wn.synsets(word)[0]).lower())))
            if examples:
                for ex in examples:
                    sig.update([w for w in wt(ex)
                                if w not in stopwords.words('english')])
            if define:
                sig.update([w for w in wt(define)
                            if w not in stopwords.words('english')])
    gen_signature.dict[word] = sig
    return sig
Example #5
def wordvecmatrix(model, data):
    IO_data={"article":[],"summaries":[]}
    i=1
    for k in range(len(data["articles"])):
        art=[]
        summ=[]
        for word in wt(data["articles"][k].lower()):
            try:
                art.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)

        for word in wt(data["summaries"][k].lower()):
            try:
                summ.append(onehot[word])
                #summ.append(model.wv.word_vec(word))
            except Exception as e:
                print(e)
        
        IO_data["article"].append(art) 
        IO_data["summaries"].append(summ)
        if i%100==0:
            print("progress: " + str(((i*100)/len(data["articles"]))))
        i+=1
    #announcedone()
    print('\007')
    return IO_data
Example #6
    def create_wordclouds(self, text, name_of_cloud, additional_stop_list, max_words, width, height, bigram = False):
        text_nopunc = self.remove_punctuation(text, "", "")
        text_lower = text_nopunc.lower()
        stop = list(self.stopwords)  # copy so repeated calls don't mutate the instance list
        stop.extend(additional_stop_list)
        text_nostop = self.remove_stopword(text_lower, stop)
        tokens = wt(text_nostop)
        text_lem = self.lemmatize(tokens)
        tokens_lem = wt(text_lem)
        my_bigrams = nltk.bigrams(tokens_lem)
        if bigram:
            bigram_merged=list()
            for line in my_bigrams:
                bigram_merged.append(line[0]+' ' + line[1])
            counts = collections.Counter(bigram_merged)
        else:
            counts = collections.Counter(tokens_lem)
        final = counts.most_common(max_words)
        max_count = max(final, key=operator.itemgetter(1))[1]
        final = [(name, count / float(max_count)) for name, count in final]

        # tags = make_tags(final, maxsize=max_word_size)
        # create_tag_image(tags, name_of_cloud+'.png', size=(width, height), layout=3, fontname='Crimson Text', background = (255, 255, 255))

        # temp_cloud = " ".join(text for text, count in final)
        word_cloud = WordCloud(font_path="fonts/Georgia.ttf",
            width=width, height=height, max_words=max_words, stopwords=stop)
        word_cloud.fit_words(final)
        word_cloud.to_file(name_of_cloud + ".png")
Example #7
def inference(x1, x2):
    #tokenize and pad
    x1 = wt(x1.lower().strip())
    x2 = wt(x2.lower().strip())

    if len(x1) >= 16:
        x1 = x1[:16]
    else:
        while (len(x1) < 16):
            x1.append("pad")

    if len(x2) >= 16:
        x2 = x2[:16]
    else:
        while (len(x2) < 16):
            x2.append("pad")
    q1 = []
    q2 = []
    for word in x1:
        try:
            q1.append(model1.wv.word_vec(word))
        except Exception as e:
            q1.append(model1.wv.word_vec("pad"))
            continue
    for word2 in x2:
        try:
            q2.append(model1.wv.word_vec(word2))
        except Exception as e2:
            q2.append(model1.wv.word_vec("pad"))
            continue

    x1 = np.asarray(q1, dtype='float32').reshape((1, 16, 256))
    x2 = np.asarray(q2, dtype='float32').reshape((1, 16, 256))
    sim_prob = siamese_model.predict([x1, x2])
    return sim_prob[0][0]
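inference() depends on two pre-trained objects assumed to exist at module level: model1, a gensim word2vec model with 256-dimensional vectors and a "pad" token in its vocabulary, and siamese_model, a Keras network. Loading them might look roughly like this; the file names are placeholders:

from gensim.models import Word2Vec
from tensorflow.keras.models import load_model

model1 = Word2Vec.load("word2vec_256d.model")         # placeholder path
siamese_model = load_model("siamese_similarity.h5")   # placeholder path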
Example #8
def prepareData(CandidateList):
    positiveText = ""
    negativeText = ""
    neutralText = ""

    vectors = []
    labels = []

    for candidate in CandidateList:
        positiveDict = candidate.positive
        for item in positiveDict:
            text = positiveDict[item].tweet
            positiveText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("positive")
    for candidate in CandidateList:
        negativeDict = candidate.negative
        for item in negativeDict:
            text = negativeDict[item].tweet
            negativeText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("negative")
    for candidate in CandidateList:
        neutralDict = candidate.neutral
        for item in neutralDict:
            text = neutralDict[item].tweet
            neutralText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("neutral")
    positiveTokens = wt(positiveText)
    negativeTokens = wt(negativeText)
    neutralTokens = wt(neutralText)

    positiveDist = freq(positiveTokens)
    negativeDist = freq(negativeTokens)
    neutralDist = freq(neutralTokens)

    tempVector = defaultdict()

    mostCount = 30
    mostPositive = positiveDist.most_common(mostCount)
    mostNegative = negativeDist.most_common(mostCount)
    mostNeutral = neutralDist.most_common(mostCount)

    for mytuple in positiveDist.items():
        if mytuple not in mostPositive and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    for mytuple in negativeDist.items():
        if mytuple not in mostNegative and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    for mytuple in neutralDist.items():
        if mytuple not in mostNeutral and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]

    print len(tempVector)
    tempvector = {tempVector[w]: w for w in tempVector}
    print len(tempvector)
    return (vectors, labels, tempvector)
Example #9
def kmtokens(text, *args):
    if len(args)==0:
        tokens=wt(text)
        return [stemmer.stem(token.strip()) for token in tokens if len(token)>1 and not token in stopwords]
    if args[0]=='pos':
        goodpos = ['N', 'J', 'R', 'V']
        tokens = nltk.pos_tag(wt(text))
        # keep only tokens whose POS tag starts with one of goodpos
        return [stemmer.stem(token[0].strip()) for token in tokens
                if len(token[0]) > 1 and token[0] not in stopwords and token[1][0] in goodpos]
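kmtokens() assumes a module-level stemmer and a stopwords container; the setup below is a guess at that context (not taken from the source project), followed by an illustrative call.

import nltk
from nltk.tokenize import word_tokenize as wt
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords as nltk_stopwords

stemmer = PorterStemmer()
stopwords = set(nltk_stopwords.words('english'))

print(kmtokens("Running quickly through the old libraries"))
# roughly: ['run', 'quickli', 'old', 'librari']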
Example #10
def get_details(url_arg):
    url = urlopen(url_arg)
    html = url.read()
    url.close()

    soup = bs(html, "html.parser")

    movie_name = " ".join(
        wt(soup.title.get_text())[:wt(soup.title.get_text()).index('Reviews')])
    total_reviews = int(wt(soup.find(align="right").get_text())[0])
    total_pages = math.ceil(total_reviews / 10)

    return (movie_name, total_reviews, total_pages)
Example #11
def get_details(movie_url):
    print(movie_url)
    url = urlopen(movie_url)
    html = url.read()
    url.close()

    soup = bs(html, "html.parser")

    movie = " ".join(
        wt(soup.title.get_text())[:wt(soup.title.get_text()).index("Reviews")])
    total_reviews = int(wt(soup.find(align="right").get_text())[0])
    total_pages = int(math.ceil(total_reviews / 10))

    return (movie, total_reviews, total_pages)
Example #12
def removeStopWords(t):
    stop_words = stopwords.words("english")
    word_feature_vector = []
    for word in wt(t):
        if word not in stop_words:
            word_feature_vector.append(word)
    return word_feature_vector
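A quick illustrative call; note that the lowercase stopword list leaves capitalized tokens such as "This" untouched.

print(removeStopWords("This is a simple test of the tokenizer"))
# ['This', 'simple', 'test', 'tokenizer']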
Example #13
def analyze(text):

    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    print(text)
    token_words = wt(text, 'english')
    print(token_words)
    complete_text = []
    for word in token_words:
        if word not in stopwords.words('english'):
            complete_text.append(word)

    s_count = sexual_count(complete_text)
    p_count = physical_count(complete_text)
    sl_count = slurs_count(complete_text)  # avoid shadowing the slurs_count() function

    if (s_count == 0) and (p_count == 0) and (sl_count == 0):
        return "Good news! Content warnings are not applicable to this literature!"
    else:
        s = "There are " + str(s_count) + " instances of sexually violent words."
        p = "There are " + str(p_count) + " instances of physically violent words."
        ss = "There are " + str(sl_count) + " instances of slurs."
        return s + " " + p + " " + ss
Example #14
def process_unlabel_lapt_for_bilstmcrf(input_fn, output_fn):
    if os.path.exists(output_fn):
        print('data already exists', output_fn)
        return
    res = []
    for f in os.listdir(input_fn):
        if not f.endswith('json'):
            continue
        f = open(input_fn + '/' + f)
        js = json.load(f)
        f.close()
        reviews = js['Reviews']
        contents = [r['Content'] for r in reviews if r['Content'] is not None]
        res.extend(contents)
    
    with open(output_fn, 'w') as f:
        for content in res:
            content = content.strip().lower()
            sents = st(content)
            for sent in sents:
                tokens = wt(sent)
                for token in tokens:
                    f.write(token.encode('utf-8'))
                    f.write(' O\n')
                f.write('\n')
Example #15
def word_overlap_score(inverted_index, docs, vectors, lam, default_dist=1):
    doc_len = len(docs)

    score_mat = np.full((doc_len, doc_len), default_dist, dtype="float32")

    index_array = lambda sents: reduce(
        lambda accu, indx_array: accu + indx_array,
        [inverted_index.get(word, []) for word in sents], [])

    tokens = [wt(sents.replace('.', '')) for sents in docs]
    words_inverted_index = list(map(index_array, tokens))
    count_dict_list = list(map(Counter, words_inverted_index))

    def get_score(i, j, intersect):
        n1 = len(tokens[i])
        n2 = len(tokens[j])

        overlap = 1 - 2 * float(intersect) / max(n1 + n2, 1)

        cosine = cosine_similarity(vectors[i].T, vectors[j].T)
        final_score = max((1 - lam) * overlap + lam * cosine.all(), 0)
        score_mat[i][j] = final_score
        score_mat[j][i] = final_score

    [[get_score(i, j, count_dict_list[i][j]) \
      for j in count_dict_list[i] if j >= i] \
     for i in tqdm(range(doc_len))]

    return score_mat
Example #16
def stem_words(data):
    def feature_tokens(tokens):
        stemtokens = list()
        for i in range(len(tokens)):
            if tokens[i] == 'not':
                i += 1
                continue
            if tokens[i] not in stop_words and not tokens[i].endswith("i"):
                stemmed = ps.stem(tokens[i])
                if len(stemmed) > 2:
                    stemtokens.append(stemmed)
        return stemtokens

    # initiate list for counting word frequencies in the list of documents
    new_train = list()
    for rawtext in data:
        # remove line breaks, indenting, punctuation, contractions
        text = processText(rawtext)

        # adds all stems that aren't stopwords
        tokens = wt(text)
        stemtokens = feature_tokens(tokens)
        new_train.append(' '.join(stemtokens))
#    print(new_train)
    return new_train
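stem_words() depends on a few module-level names that are not shown here (processText, ps, stop_words). A minimal, assumed setup could be:

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize as wt

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def processText(rawtext):
    # simple stand-in: lowercase, strip non-letters, collapse whitespace
    text = re.sub(r"[^a-z\s]", " ", rawtext.lower())
    return re.sub(r"\s+", " ", text).strip()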
Example #17
def check_nouns_adverbs_adjectives(sentence):
    tokenized = wt(sentence)
    tags = nltk.pos_tag(tokenized)
    return " ".join([
        i[0] for i in tags
        if (i[1][0] == 'N') or (i[1][0] == 'R') or (i[1][0] == 'J')
    ])
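An illustrative call (the exact output depends on the tagger model shipped with your NLTK version):

print(check_nouns_adverbs_adjectives("The quick brown fox jumps over the lazy dog"))
# typically: 'quick brown fox lazy dog'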
Example #18
def normalize_query_and(q):
    q = wt(q)
    normalized_q = []
    for word in q:
        if word not in punctuation:
            normalized_q.append(stemmer.stem(word))
    return normalized_q
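This snippet (and its duplicate in Example #25) presumes a module-level stemmer and punctuation; an assumed setup plus a sample call:

from string import punctuation
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize as wt

stemmer = PorterStemmer()

print(normalize_query_and("running, jumping and swimming!"))
# ['run', 'jump', 'and', 'swim']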
Example #19
File: main.py  Project: starry9t/TopicLabel
def preprocessTxt(txtfile, newfile):
    with open(txtfile, 'r') as reader:
        data = reader.readlines()
    reader.close()
    new_data = []
    renew_data = []
    pro_data = []
    for line in data:
        sents = wt(line.strip())
        new_data.append(sents)
    with open(newfile, 'w') as etf:
        for line in new_data:
            renew_line = []
            pro_line = []
            for word in line:
                word = str.lower(word)
                renew_line.append(word)
                etf.write('{} '.format(word))
                if word[0].isalpha():
                    pro_line.append(word)
                else:
                    pass
            renew_data.append(renew_line)
            pro_data.append(pro_line)
            etf.write('\n')
    etf.close()
    print('txtfile preprocessed.')

    return renew_data, pro_data
Example #20
def build_vocab(train_file, vocab_file):
    vocab = defaultdict(int)
    with codecs.open(train_file, 'r', 'utf-8') as f:
        for line in f:
            d = json.loads(line)
            sentence1 = [w.lower() for w in wt(d['sentence1'])]
            sentence2 = [w.lower() for w in wt(d['sentence2'])]
            for word in sentence1:
                vocab[word] += 1
            for word in sentence2:
                vocab[word] += 1
    
    writer = codecs.open(vocab_file, 'wb')
    vocab = sorted(vocab.items(), key=itemgetter(1), reverse=True)
    pickle.dump(vocab, writer)
    writer.close()
Example #21
def tfidfCounts(data):
    # initiate list for counting word frequencies in the list of documents
    count = {}
    for rawtext in data:
        # remove line breaks, indenting, punctuation, contractions
        text = processText(rawtext)
        # adds all stems that aren't stopwords
        tokens = wt(text)
        stemtokens = feature_tokens(tokens)
        docwords = {}
        # adds all bigrams in the form [not ___] (excluding stop words)
        #        stemtokens += [tokens[i] + ps.stem(tokens[i + 1]) for i in range(len(tokens)-1) if tokens[i] == 'not' and not tokens[i + 1] in stop_words]
        for stem in stemtokens:
            if not stem in count:
                count[stem] = 0
            if not stem in docwords:
                docwords[stem] = 0
        docInfo = 0
        for word in docwords:
            for stem in stemtokens:
                if word == stem:
                    docwords[word] += 1
        for word in docwords:
            count[word] += math.log(
                len(stemtokens) / docwords[word]) * docwords[word]
    return count
Example #22
def summonehot(corpus):
    allwords=[]
    annotated={}
    for sent in corpus:
        for word in wt(sent):
            allwords.append(word.lower())
    print(len(set(allwords)), "unique words in corpus")
    #maxcorp=int(input("Enter desired number of vocabulary: "))
    maxcorp=int(len(set(allwords))/1.1)
    wordcount = Counter(allwords).most_common(maxcorp)
    allwords=[]
    
    for p in wordcount:
        allwords.append(p[0])  
        
    allwords=list(set(allwords))
    
    print(len(allwords), "unique words in corpus after max corpus cut")
    #integer encode
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(allwords)
    #one hot
    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    #make look up dict
    for k in range(len(onehot_encoded)): 
        inverted = cleantext(label_encoder.inverse_transform([argmax(onehot_encoded[k, :])])[0]).strip()
        annotated[inverted]=onehot_encoded[k]
    return label_encoder,onehot_encoded,annotated
Example #23
	def encode(self, corpus):
		all_words = []
		one_hot = {}
		for sent in corpus:
			for word in wt(' '.join(sent)):
				all_words.append(word.lower())
		#print(len(set(all_words)), "unique words in corpus")
		logger.info(str(len(set(all_words))) + ' unique words in corpus')
		#maxcorp=int(input("Enter desired number of vocabulary: "))
		maxcorp = int(len(set(all_words)) / 1.1)
		wordcount = Counter(all_words).most_common(maxcorp)
		all_words = []

		for p in wordcount:
			all_words.append(p[0])  
		    
		all_words = list(set(all_words))

		#print(len(all_words), "unique words in corpus after max corpus cut")
		#logger.info(str(len(all_words)) + 'unique words in corpus after max corpus cut')
		#integer encode
		#label_encoder = LabelEncoder()
		#integer_encoded = label_encoder.fit_transform(all_words)
		#one hot
		label_encoder = LabelEncoder()
		integer_encoded = label_encoder.fit_transform(all_words)
		onehot_encoder = OneHotEncoder(sparse=False)
		#integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
		onehot_encoded = onehot_encoder.fit_transform(np.array(all_words).reshape(-1, 1))
		for i in range(len(onehot_encoded)):
			word = label_encoder.inverse_transform([argmax(onehot_encoded[i, :])])[0].strip()
			one_hot[word] = onehot_encoded[i]
		#print(len(one_hot.keys()))
		return one_hot
Example #24
def makeEmbeddingMap(text, corpusCounts, numgram, maxperkey):
    if numgram < 2:
        return -1
    freqs = getNgramCounts(text, numgram)
    topgrams = getNMax(freqs, 2000)
    stemmedWords = {}
    onlyStem = {}
    for entry in topgrams:
        tokens = wt(entry[0])
        tokens = [stemmer.stem(token) for token in tokens]
        stem = " ".join(tokens)
        if stem in stemmedWords:
            stemmedWords[stem][entry[0]] = entry[1]
            onlyStem[stem] += entry[1]
        else:
            stemmedWords[stem] = {}
            stemmedWords[stem][entry[0]] = entry[1]
            onlyStem[stem] = entry[1]
    topgrams = getNMax(onlyStem, 1500)
    vectorMap = {}
    for word in corpusCounts:
        vectorMap[word[0]] = list()
        for gram in topgrams:
            if len(vectorMap[word[0]]) - 1 < maxperkey and word[0] in gram[
                    0] and not gram[0] in vectorMap[word[0]]:
                vectorMap[word[0]].append(gram[0])
    return vectorMap
Example #25
def normalize_query_and(q):
    q = wt(q)
    normalized_q = []
    for word in q:
        if word not in punctuation:
            normalized_q.append(stemmer.stem(word))
    return normalized_q
Example #26
def feature_extractor(preprocessed_tweet):
    feature = {}
    word_and_tag = nltk.pos_tag(wt(preprocessed_tweet))

    # add pos has a feature too if the accuracy is less
    all_tags = []
    all_words = []

    for (w,t) in word_and_tag:
        all_tags.append(t)
    
    for (w,t) in word_and_tag:
        if t in ["VB","VBD","VBN","VBP","VBZ","VBG"]:
            all_words.append(w)
        if t in ["JJ","JJR","JJS"]:
            all_words.append(w)
        if t in ["RB","RBS","RBR"]:
            all_words.append(w)
        if t in ["WRB","MD","IN","RP","CD","NN","NNP"]:
            all_words.append(w)
            
    for word in feature_bag_of_word:
        feature[word] = (word in all_words)

    for tag in tags:
        feature[tag] = (tag in all_tags)
        
    return feature
Example #27
    def proc_sentence(sentence):
        raw_text = sentence.find('text').text.lower()
        opinions = sentence.find('Opinions')
        if opinions is None:
            return (wt(raw_text), ['O'] * len(wt(raw_text)))

        # make sure the target words will be separated by inserting spaces
        text_for_tokenization = raw_text[:].replace('/', ' / ')
        for opinion in opinions:
            if 'target' in opinion.attrib and opinion.attrib[
                    'target'] != 'NULL':
                text_for_tokenization = text_for_tokenization.replace(
                    opinion.attrib['target'],
                    ' ' + opinion.attrib['target'] + ' ')

        tokens = wt(text_for_tokenization)
        spans = get_spans(raw_text, tokens)
        char_idx_to_word_idx = {s[1]: idx
                                for idx, s in enumerate(spans)
                                }  # map origin index to the tokenized words
        tags = ['O'] * len(spans)
        #print(char_idx_to_word_idx)

        if opinions is not None:
            for opinion in opinions:
                if 'from' not in opinion.attrib:
                    continue
                sidx = int(opinion.attrib['from'])
                eidx = int(opinion.attrib['to'])
                if sidx == eidx == 0:
                    continue
                token_sidx, token_eidx = 1000, 0
                tag = 'B'
                for idx in range(sidx, eidx):
                    if idx in char_idx_to_word_idx:
                        token_sidx = min(token_sidx, char_idx_to_word_idx[idx])
                        token_eidx = max(token_eidx, char_idx_to_word_idx[idx])
                        tags[char_idx_to_word_idx[idx]] = 'B'
                for idx in range(token_sidx, token_eidx + 1):
                    tags[idx] = tag
                    tag = 'I'
                if sidx not in char_idx_to_word_idx:
                    print('warning', tokens, text_for_tokenization, sidx,
                          spans, zip([s[0] for s in spans], tags))
                    #raise Exception('warning', tokens, text_for_tokenization, sidx, spans, zip([s[0] for s in spans], tags))

        return (tokens, tags)
Example #28
def predict(trained_model, testing_path):
    print("Start predict module")
    try:
        with open('test_featuresets.dmp', 'rb') as fp:
            print("Test featuresets found!")
            documents = pickle.load(fp)
    except:
        print("No existing test featuresets. Make a new one.")

        from os import listdir
        pospath = testing_path + r"\pos"
        negpath = testing_path + r"\neg"
        posfiles = listdir(pospath)
        negfiles = listdir(negpath)

        documents = []
        for fn in posfiles:
            ff = open(pospath + '/' + fn, encoding='windows-1252')
            ftok = [word.lower() for word in wt(ff.read())]
            fset = set(ftok)
            tmpdict = {}
            for key in fset:
                tmpdict[key] = ftok.count(key)
            documents.append((tmpdict, 'pos'))
        for fn in negfiles:
            ff = open(negpath + '/' + fn, encoding='windows-1252')
            ftok = [word.lower() for word in wt(ff.read())]
            fset = set(ftok)
            tmpdict = {}
            for key in fset:
                tmpdict[key] = ftok.count(key)
            documents.append((tmpdict, 'neg'))

        with open('test_featuresets.dmp', 'xb') as fp:
            pickle.dump(documents, fp)

    model_predictions = []
    ground_truth = []
    count = 0
    for body, senti in documents:
        count += 1
        model_predictions.append(trained_model.classify(body))
        ground_truth.append(senti)
        print("Doc " + str(count) + " Done")

    print("Finish predict module")
    return model_predictions, ground_truth
Example #29
def normalize_and():
    for doc in content:
        doc = wt(doc.lower())
        sentence = []
        for word in doc:
            if word not in punctuation:
                sentence.append(stemmer.stem(word))
        processed_docs.append(sentence)
Example #30
def parseStopWords(sentence):
    words = wt(sentence)
    english = st.words('english')  # st is assumed to be an alias for nltk.corpus.stopwords
    ss = ''
    for word in words:
        if word.lower() not in english:
            ss += word + ' '
    return ss
Example #31
def removeURLs(unprocessed_dataset_with_label):
    for tweet in unprocessed_dataset_with_label:
        tweet_words_array = wt(tweet["tweet"])        
        for word in list(tweet_words_array):
            if word == 'https' or word == 'https…' or word == 'http' or word == 'http…' or word == ':' or word[:2] == '//':
                tweet_words_array.remove(word)                
        tweet["tweet"] = " ".join(tweet_words_array)
    return unprocessed_dataset_with_label
Example #32
def normalize_and():
    for doc in content:
        doc = wt(doc.lower())
        sentence = []
        for word in doc:
            if word not in punctuation:
                sentence.append(stemmer.stem(word))
        processed_docs.append(sentence)
Example #33
def get_sents_vector(sents, model):
    """
    Function to convert given sentence into vectors
    :param sents: tokenized sentence
    :param model: model
    :return: sentence vectors
    """
    return vectorize_sentence(wt(sents), model)
Example #34
def get_inverted_index(docs):
    tokesize = lambda x: wt(x)
    tokens_array = list(map(tokesize, docs))

    inverted_index = {}
    _ = [[inverted_index.setdefault(word, []).append(i) \
          for word in tokens_array[i]] \
         for i in range(len(tokens_array))]
    return inverted_index
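A small illustrative run:

docs = ["the cat sat", "the dog sat down"]
print(get_inverted_index(docs))
# {'the': [0, 1], 'cat': [0], 'sat': [0, 1], 'dog': [1], 'down': [1]}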
Example #35
	def process_query(self):
		"""Q.process_query() -- processes the user query, 
		by tokenizing and stemming words.
		"""
		self.query = wt(self.query)
		self.processed_query = []
		for word in self.query:
			if word not in self.stop_words and word not in self.punctuation:
				self.processed_query.append(self.stemmer.stem(word))
Example #36
def normalized_and():
	processed_docs = []
	for doc in content:
		doc = wt(doc)
		sentence = []
		for word in doc:
			if word not in punctuation:
				sentence.append(word)
		processed_docs.append(sentence)
	return processed_docs
Example #37
def remove_stop_words_and_stem(soup):
    result = get_soup_text(soup)
    tokenized_text = wt(result)
    filtered_array = []
    for word in tokenized_text:
        # Eliminating the stopwords
        if word not in stop_words and word.isalpha():
        # Only considering alphabetical strings
            filtered_array.append(stemmer.stem(word))
    return TextBlob(' '.join(filtered_array))
Example #38
def tokenize(source):
	words = wt(source.lower().replace("can't", "can").replace("won't", "will").replace("gonna", "go"))
	words = [x for x in words if x.isalpha() and len(x) > 1]
	reformed = [morphy(x) or x for x in words]
	made = []
	count = len(reformed)
	for word in reformed:
		if word not in made:
			made.append(word)
	return made, count
Example #39
def normalize_query(q):
	q = re.sub(r'(,)([0-9]+)', r"\1 \2", q.lower())
	q = re.sub(r'boundary', "four six", q)
	q = wt(q)
	normalized_q = []
	for word in q:
		if word not in stop and word not in punctuation:
			if word in no_replace_query.keys():
				word = no_replace_query[word]
			normalized_q.append(stemmer.stem(word))
	return normalized_q
Example #40
def normalize():
	for doc in content:
		doc = re.sub(r'(,)([0-9]+)', r"\1 \2", doc.lower())
		doc = wt(doc)
		sentence = []
		for word in doc:
			if word not in stop and word not in punctuation:
				word = stemmer.stem(word)
				if word in no_replace.keys():
					word = no_replace[word]
				sentence.append(word)
		processed_docs.append(sentence)
Example #41
	def process_corpus(self):
		"""Q.process_corpus() -- processes the queries defined by us, 
		by tokenizing, stemming, and removing stop words.
		"""
		for doc in self.corpus_list:
			doc = wt(doc)
			sentence = []
			for word in doc:
				if word not in self.stop_words and word not in self.punctuation:
					word = self.stemmer.stem(word)
					sentence.append(word)
			self.processed_corpus.append(sentence)
Example #42
	def simHelper(T):
		'''
		Given a token returns a pos tagged list 
		'''
		alphanum = letters+octdigits

		# part of speech word list for the text
		fullList = [word for subl in [pos_tag(wt(s)) for s in st(T)] for word in subl]

		# remove symbols and -NONE- tags from list by checking the first character of the word and tag
		posList = [word for word in fullList if word[1][0] in alphanum and word[0][0] in alphanum]

		return posList
Example #43
def initial_work():
    with open('C:/Users/Isha/Desktop/AldaSampling/data/StopWords.txt', 'rb') as infile:
            splitwordslist = infile.read().decode('UTF-8')
    myfilehtml = open('C:/Users/Isha/Desktop/AldaSampling/output/Processed/unsponsored/1010023_raw_html.txt')
    soup = BeautifulSoup(myfilehtml,'html.parser')
    for scriptdata in soup.findAll('script'):
        scriptdata.extract()
    result = soup.get_text().lower()
    output=open("output.txt","w")
    output.write(result)
    tokenized_text = wt(result)
    stop_words = wt(splitwordslist)
    for words in tokenized_text:
        if words not in stop_words:
            if words.isalpha():
                filtered_array.append(ps.stem(words))
    print(len(filtered_array))
    print(filtered_array)

    filtered_doc = ' '.join(filtered_array)
    print(filtered_doc)
    list_of_docs.append(tblob(filtered_doc))
    list_of_docs.append(tblob('experi unicorn astro table'))
    print(list_of_docs)
Example #44
def filter_sentences(corpus='rbc_se.txt', dic='names.txt', newfile='rbc_sent_filt_new.txt'):
    name_dict = set([x.strip('\n') for x in open('.\\preprocessing\\' + dic, encoding='utf-8')])
    text = open(corpus, encoding='utf-8')
    new_file = open(newfile, 'w', encoding='utf-8')
    for line in text:
        tokens = wt(line.strip('\n'))
        for word in tokens:
            if word in punkt:
                continue
            word = word.lower()
            if word in name_dict:
                new_file.write(line)
                break
    text.close()
    new_file.close()
Example #45
 def pos_tag(self, data, columns, tag):
     result = pd.DataFrame(columns=columns)
     data = data.dropna()
     result.title = data.title
     for index, row in data.iterrows():
         if not isinstance(row['summary'], int):
             #temp = self.normalize(row['summary'].strip('#'))
             temp = row['summary']
             pos = pos_tag(wt(temp))
             temp = " ".join(noun[0] for noun in pos if noun[1] in tag)
             print temp
             result.loc[index, 'summary'] = unicode(temp)
     # for c in columns:
     #     for index, row in data.iterrows():
     #         if not isinstance(row[c], int):
     #             temp = self.normalize(row[c])
     #             pos = pos_tag(wt(temp))
     #             temp = " ".join(noun[0] for noun in pos if noun[1] in tag)
     #             result.loc[index, c] = unicode(temp)
     return result.dropna()
Example #46
	def summarize(self, text, n):

		sents = st(text)
		assert n <= len(sents)
		# assert is a way of making sure a condition holds true
		# will throw error if it is false

		word_sent = [wt(s.lower()) for s in sents]
		# list of lists of all the sentences 
		self._freq = self._compute_frequencies(word_sent)
		ranking = defaultdict(int)
		for i,sent in enumerate(word_sent):
			# enumerate creates a tuple with index,element for each entry in the list
			# allows need for a counter variable, but makes it easy to index
			for word in sent:
				if word in self._freq:
					ranking[i] += self._freq[word]

		sents_idx = nlargest(n,ranking, key = ranking.get)
		return [sents[j] for j in sents_idx] # sexy list comprehension
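summarize() calls self._compute_frequencies(), which is not part of this listing. A plausible implementation, treated here as an assumption and meant to live on the same class, normalizes word counts and drops stopwords plus overly common or rare terms:

from collections import defaultdict
from nltk.corpus import stopwords

def _compute_frequencies(self, word_sent, min_cut=0.1, max_cut=0.9):
	freq = defaultdict(int)
	stop = set(stopwords.words('english'))
	for sent in word_sent:
		for word in sent:
			if word not in stop:
				freq[word] += 1
	# normalize by the most frequent word and drop overly common/rare terms
	max_freq = float(max(freq.values()))
	for word in list(freq):
		freq[word] = freq[word] / max_freq
		if freq[word] >= max_cut or freq[word] <= min_cut:
			del freq[word]
	return freq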
Example #47
def prepareData(CandidateList):
    positiveText = ""
    negativeText = ""
    neutralText = ""

    vectors = []
    labels = []

    for candidate in CandidateList:
        positiveDict = candidate.positive
        for item  in positiveDict:
            text = positiveDict[item].tweet
            positiveText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("positive")
    for candidate in CandidateList:
        negativeDict = candidate.negative
        for item  in negativeDict:
            text = negativeDict[item].tweet
            negativeText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("negative")
    for candidate in CandidateList:
        neutralDict = candidate.neutral
        for item  in neutralDict:
            text = neutralDict[item].tweet
            neutralText += text
            vec = text.split()
            vectors.append(vec)
            labels.append("neutral")
    positiveTokens = wt(positiveText)
    negativeTokens = wt(negativeText)
    neutralTokens = wt(neutralText)

    positiveDist = freq(positiveTokens)
    negativeDist = freq(negativeTokens)
    neutralDist = freq(neutralTokens)

    tempVector = defaultdict()

    mostCount = 30
    mostPositive = positiveDist.most_common(mostCount)
    mostNegative = negativeDist.most_common(mostCount)
    mostNeutral = neutralDist.most_common(mostCount)

    for mytuple in positiveDist.items():
        if mytuple not in mostPositive and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    for mytuple in negativeDist.items():
        if mytuple not in mostNegative and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]
    for mytuple in neutralDist.items():
        if mytuple not in mostNeutral and mytuple[1] > 1:
            tempVector[len(tempVector)] = mytuple[0]

    print len(tempVector)
    tempvector = {tempVector[w]: w for w in tempVector}
    print len(tempvector)
    return (vectors,labels,tempvector)
Example #48
		# average paragraph size
		wst = WhitespaceTokenizer()
		paraWordCounts = [len(wst.tokenize(p)) for p in paragraphs]

		# the approximate number of words in the document
		numWords = sum(paraWordCounts)

		# the average number of words per paragraph
		avgParagraphLen = mean(paraWordCounts)

		# rejoin the paragraphs
		text = ' '.join(paragraphs)

		# part of speech word list for the text
		text = [word for subl in [pos_tag(wt(s)) for s in st(text)] for word in subl]

		# remove symbols from list by checking the first character of the word
		text = [word for word in text if word[0][0] in alphanum]

		# convert words to lowercase and convert Penn Tree Bank tags to WordNet tags
		text = [(word[0].lower(), convertTag(word[1])) for word in text]

		# remove Nones
		text = [word for word in text if word[1]]

		nouns = [word for word in text if word[1] == 'n']
		numNouns = len(nouns)

		verbs = [word for word in text if word[1] == 'v']
		numVerbs = len(verbs)
Example #49
# This shit doesn't make sense but oh well. The Experiment has been experimented.

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize as wt

ps = PorterStemmer()

# words = ["python", "pythoner", "pythoning", "pythoned", "pythonly"]

# for w in words:
# 	print ps.stem(w)

new_text = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once"
words = wt(new_text)

for w in words:
	print ps.stem(w)
Example #50
def __extractSynSets(T):
	'''
	Given a text T (as a string) find all words that have WordNet synsets
	@return a unique list of SynSet objects
	'''

	'''
	CONSTANTS
	'''
	nounTags = ['NN','NNP','NNS','NNPS']
	verbTags = ['VB','VBD','VBG','VBN','VBP','VBZ']
	adjTags = ['JJ','JJR','JJS']
	advTags = ['RB','RBR','RBS']
	alphanum = letters+octdigits

	
	def convertTag(tag):
		'''
		Converts a Penn Tree Bank POS tag to a WordNet
		@return the converted tag otherwise None
		'''
		if tag in nounTags:
			return 'n'
		elif tag in verbTags:
			return 'v'
		elif tag in adjTags:
			return 'as' # adjectives in WordNet can be head adj 'a' or satellite adj 's'
		elif tag in advTags:
			return 'r'
		else:
			return None 
	
	def getSynSet(w):
		'''
		For a word 'w' with POS tag 'tag' find the corresponding WordNet synset
		@return the best matching sysnset for 'w' otherwise None
		'''
		tag = w[1]
		word = w[0]

		# get the list of possible synsets for w
		sets = wn.synsets(word)
		
		if not tag or sets == []:
			return None

		# look through the list of possible synsets for the first one w/ a pos tag that matches 'tag'
		for s in sets:
			if s.pos in tag:
				return s

		return None

	# part of speech word list for the text
	fullList = [word for subl in [pos_tag(wt(s)) for s in st(T)] for word in subl]

	# remove symbols and -NONE- tags from list by checking the first character of the word and tag
	posList = [word for word in fullList if word[1][0] in alphanum and word[0][0] in alphanum]

	# convert words to lowercase and convert Penn Tree Bank tags to WordNet tags
	posList = [(word[0].lower(), convertTag(word[1])) for word in posList]

	# remove words for which there is no WordNet tag (i.e. tag is None) and remove duplicate values
	posList = list(set([word for word in posList if word[1]]))

	# for the words in the POS list create a list of syn sets using their tags (remove None values)
	synSets = [n for n in [getSynSet(w) for w in posList] if n] 

	return synSets
Example #51
'''
    Beautifier is a module that does NLP on HTML files from the data for this project
'''

from bs4 import BeautifulSoup as bs
from nltk.tokenize import word_tokenize as wt
from nltk.stem import PorterStemmer
from textblob import TextBlob

stop_words = []
stemmer = PorterStemmer()

with open('res/StopWords.txt', 'rb') as infile:
    stop_words_file = infile.read().decode('UTF-8')
    stop_words = set(wt(stop_words_file))

def get_soup_text(soup):
    for scriptdata in soup.findAll(["script", "style"]):
        scriptdata.extract()
    return soup.text.lower()

def remove_stop_words_and_stem(soup):
    result = get_soup_text(soup)
    tokenized_text = wt(result)
    filtered_array = []
    for word in tokenized_text:
        # Eliminating the stopwords
        if word not in stop_words and word.isalpha():
        # Only considering alphabetical strings
            filtered_array.append(stemmer.stem(word))
    return TextBlob(' '.join(filtered_array))
Example #52
def clean_tokens():
    n_topics = 100

    # get all tweets
    tw = [line.strip('\n') for line in file('corpus_full')]

    # lower case and tokenize
    print 'Lower casing'
    tokens = [[word.lower() for word in wt(tt)] for tt in tw]

    # filter punctuations
    print 'Filtering punctuations'
    punc = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*',\
            '@', '#', '$', '%']
    tokens_unpunc = [[w for w in tk if not w in punc] for tk in tokens]

    # filter stopping words
    print 'Filtering stopping words'
    english_stopwords = stopwords.words('english')
    tokens_filtered = [[w for w in tk if not w in english_stopwords] for tk in\
            tokens_unpunc]

    # stemming
    print 'Stemming words'
    st = LancasterStemmer()
    tokens_stemmed = [[st.stem(w) for w in tk] for tk in tokens_filtered]

    # eliminate words with count == 1
    '''
    print 'Eliminating words appear once'
    all_items = sum(tokens_stemmed, [])
    print len(all_items)
    print 'Building once'
    once = set(t for t in set(all_items) if all_items.count(t) == 1)
    print 'Generating final tokens'
    final_tokens = [[s for s in tk if s not in once] for tk in tokens_stemmed]
    '''

    # eliminate some specific words and words that appear only once
    count = collections.defaultdict(int)
    for t in tokens_stemmed:
        for w in t:
            if w == 'http' or w[0 : 6] == '//t.co':
                print w
            else:
                count[w] += 1
    tokens_stemmed = [[st.stem(w) for w in tk] for tk in tokens_filtered]
    tokens_final = [[w for w in tk if count[w] > 1] for tk in tokens_stemmed]

    # LDA
    logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s',\
            level = logging.INFO)
    dictionary = gensim.corpora.Dictionary(tokens_final)
    print 'Building corpus for LDA'
    corpus = [dictionary.doc2bow(t) for t in tokens_final]
    print 'LDA'
    lda = gensim.models.ldamodel.LdaModel(corpus = corpus, id2word = dictionary,
            num_topics = n_topics)
    print lda.print_topics()


    # extract topics for tweets
    topic_matrix = []
    for i in range(98900):
        topics = lda[dictionary.doc2bow(tokens_final[i])]
        v = [0.] * n_topics
        for t in topics:
            v[t[0]] = t[1]
        topic_matrix.append(v)

    # write matrix to the disk
    np.save('topic_matrix', topic_matrix)

    # write topics to the disk
    topics = lda.show_topics(-1)
    with open('topics', 'a') as f:
        for i, t in enumerate(topics):
            f.write(str(i) + '-' + t + '\n')
Example #53
'''
Created on Mar 24, 2011

@author: Blodstone
'''

from nltk import ne_chunk as nc
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize as wt

tokenizeWords = wt('Who is Samuel Pickering ?')
pos = pos_tag(tokenizeWords)
n = nc(pos)
print n