Example #1
def lemma_tokenize(paragraph):
    lmtzr = WordNetLemmatizer()
    try:
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
    except LookupError:
        nltk.download('wordnet')
        return [lmtzr.lemmatize(word).lower() for sentence in tokenize(paragraph) for word in sentence]
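A quick way to exercise this snippet: it relies on a project-level tokenize() that yields one token list per sentence, so the sketch below supplies a plausible stand-in (the helper and sample text are illustrative, not from the original project).

import nltk
from nltk.stem import WordNetLemmatizer

def tokenize(paragraph):
    # hypothetical stand-in: one word-token list per sentence
    return [nltk.word_tokenize(s) for s in nltk.sent_tokenize(paragraph)]

print(lemma_tokenize("The cats were running. Dogs bark."))
# prints lemmatized, lowercased tokens, e.g. ['the', 'cat', 'were', ...]
# (may require nltk.download('punkt') and nltk.download('wordnet') on first run)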
Example #2
def parse_stories(lines, only_supporting=False):
    '''Parse stories provided in the bAbi tasks format
    If only_supporting is true, only the sentences that support the answer are kept.
    '''
    data = []
    story = []
    for line in lines:
        line = line.decode('utf-8').strip()
        nid, line = line.split(' ', 1)
        nid = int(nid)
        if nid == 1:
            story = []
        if '\t' in line:
            q, a, supporting = line.split('\t')
            q = tokenize(q)
            substory = None
            if only_supporting:
                # Only select the related substory
                supporting = map(int, supporting.split())
                substory = [story[i - 1] for i in supporting]
            else:
                # Provide all the substories
                substory = [x for x in story if x]
            data.append((substory, q, a))
            story.append('')
        else:
            sent = tokenize(line)
            story.append(sent)
    return data
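For context, a hedged sketch of the bAbI-style byte lines parse_stories expects: numbered sentences, with question lines carrying tab-separated question/answer/supporting IDs (the sample lines below are made up; tokenize is assumed to be a plain word tokenizer).

sample = [
    b"1 Mary moved to the bathroom.",
    b"2 John went to the hallway.",
    b"3 Where is Mary?\tbathroom\t1",
]
# parse_stories(sample) would return one (substory, question_tokens, 'bathroom') triple,
# where substory holds the tokenized sentences from lines 1 and 2.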
def generation(cluster_matrix):
    # Alternative distance metric:
    # for each storyline-word, find the most similar words, then
    # keep only those that are reasonably close to the other storyline-words.
    tagdict = [
        'NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ',
        'JJR', 'JJS', 'RB', 'RBR', 'RBS'
    ]
    allpool = []
    for cluster in cluster_matrix:
        pool = []
        try:
            for word in cluster:
                # skip storyline-words that are not content words (noun/verb/adjective/adverb)
                try:
                    thisword_tk = nltk.word_tokenize(word)
                    thistag_tk = nltk.pos_tag(thisword_tk)
                    if thistag_tk[0][1] not in tagdict:
                        continue
                except Exception:
                    continue
                # candidate words most similar to this storyline-word
                cand = [
                    tup[0] for tup in glove_model.most_similar(word, topn=200)
                ]
                # total distance from each candidate to the other storyline-words
                cand2clust_dists = np.sum([
                    glove_model.distances(x, cand)
                    for x in cluster if x != word
                ], axis=0)
                # indexes of qualified words in cand (comparing among themselves)
                indexes = cand2clust_dists.argsort()[:200]
                keep = set()
                print(cand)
                smallest = min(len(cand), 200)
                for i in range(smallest):
                    # keep up to 25 candidates that carry a content-word POS tag
                    try:
                        word_tk = nltk.word_tokenize(cand[i])
                        tag_tk = nltk.pos_tag(word_tk)
                        if tag_tk[0][1] in tagdict:
                            keep.add(cand[i])
                    except Exception:
                        continue
                    if len(keep) == 25:
                        break
                # OR, comparing with all vocab:
                # indexes of words whose total distance to other storyline-words
                # is within the top 1% of all vocab
                #top_dist = np.percentile(np.sum([glove_model.distances(x) for x in cluster if x != word], axis=0), 1)
                #keep = [cand[i] for i in range(len(cand2clust_dists)) if cand2clust_dists[i] <= top_dist]
                pool = pool + list(keep)  # keep is a set; convert before concatenating
                print(pool)
            allpool = allpool + pool
        except Exception:
            print("Sad!!")
            return None
    return allpool
Example #4
def jaccard_similarity(a, b, threshold=0.5):
    """Check if a and b are matches."""
    tokens_a = [token.lower().strip(string.punctuation) for token in tokenize(a) \
    if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation) for token in tokenize(b) \
    if token.lower().strip(string.punctuation) not in stopwords]

    # Calculate Jaccard similarity
    ratio = len(set(tokens_a).intersection(tokens_b)) / float(
        len(set(tokens_a).union(tokens_b)))
    return ratio
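A usage sketch: the snippet relies on module-level tokenize and stopwords objects, so plausible stand-ins are defined here (the names mirror the original but are assumptions).

import string
from nltk import word_tokenize as tokenize
from nltk.corpus import stopwords as _sw

stopwords = set(_sw.words("english"))  # may need nltk.download('stopwords') first

print(jaccard_similarity("The quick brown fox", "A quick red fox"))  # 0.5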
Example #5
def compute_similarity(text1, text2):
    w1, w2 = tokenize(text1), tokenize(text2)
    # create a sorted common vocabulary for the two lists of words
    common_vocab = sorted(set(w1) | set(w2))

    # create count vectors of the same length for the two lists of words
    v1 = count_vectorize(w1, common_vocab)
    v2 = count_vectorize(w2, common_vocab)

    distance = compute_distance(v1, v2)
    return distance
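The helpers referenced above (count_vectorize and compute_distance) are not shown; a minimal sketch of what they plausibly do, assuming a count vector over the shared vocabulary and a Euclidean distance (the real project may differ):

import math

def count_vectorize(words, vocab):
    # one count per vocabulary term, in vocabulary order
    return [words.count(term) for term in vocab]

def compute_distance(v1, v2):
    # Euclidean distance between the two count vectors
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(v1, v2)))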
Example #6
def chat_with_robo():
    parser = Parser()

    flag = True
    print("Instructions for talking with me: \n",
          "If you want to finish the conversation, please type thanks or bye.\n")
    print("ROBO: Hi, my name is Robo.")
    while flag:
        message = input()
        message = message.lower()

        if message != 'bye':
            # Analyzing the input
            print('\nvocabulary: ', nltk.word_tokenize(message))
            print('\nword frequency: ',
                  nltk.FreqDist(nltk.word_tokenize(message)).most_common(10))

            # -----------
            # add part-of-speech tags to text
            # -----------
            # Tagging message with basic nltk tokenization
            print(nltk.pos_tag(nltk.word_tokenize(message)))
            # It has trouble identifying the pronoun 'I' and tags it as a noun.

            # Tagging message

            # trace = 1: the parser reports the steps it takes as it parses a text.
            # rd_parser = nltk.RecursiveDescentParser(, trace = 1)

            # Review grammar
            # rd_parser = nltk.RecursiveDescentParser(nltk.ChartParser)
            rd_parser = parser.parse(message)

            i = 1
            wrong_syntax = 1

            for tree_struc in rd_parser:
                print(str(i) + ' tree_struc: ', tree_struc)

                wrong_syntax = 0
                print("\n Correct Grammar")
                i += 1
            if wrong_syntax == 1:
                print("\n Wrong Grammar")

                # write_output_file(...

        else:
            flag = False
            print("ROBO: Bye! Take care.")
Example #7
def is_text_initial(term, text, start_within=5, ignore_case=True):
    if not isinstance(term, list):
        if ignore_case: term = term.lower()
        term_tokens = tokenize(term)
    else:
        term_tokens = [token.lower() for token in term] if ignore_case else term
    if not isinstance(text, list):
        if ignore_case: text = text.lower()
        text_tokens = tokenize(text)
    else:
        text_tokens = [token.lower() for token in text] if ignore_case else text
    spacey_text = ' '+(' '.join(text_tokens[:start_within-1+len(term_tokens)]))+' '
    spacey_term = ' '+(' '.join(term_tokens))+' '
    return spacey_term in spacey_text
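Two illustrative calls, assuming tokenize is a word tokenizer such as nltk.word_tokenize:

from nltk import word_tokenize as tokenize

print(is_text_initial("machine learning", "Machine learning methods for NLP"))     # True
print(is_text_initial("NLP", "Machine learning methods for NLP", start_within=3))  # False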
Example #8
def predict(text):

    tokens = tokenize(text)

    i = 0
    prep, origin = prepare_tokens(tokens[i:i + 200])

    pred = model.predict(prep)

    origin = fix_holes(origin, pred)

    while i * 200 < len(origin):

        p, o = prepare_tokens(tokens[i:i + 200])
        i += 1
        pred = model.predict(p)

        o = fix_holes(o, pred)

        origin = origin + o

    return origin, avg(origin)


#print(prepare_text(a), model.predict(pa))
def process_line(num, line):
    global docs
    docs.append(doc2vec.LabeledSentence(words=nltk.word_tokenize(line), labels=["SENT_" + str(num)]))
    if len(docs) > 100:
        doc2vec_model.build_vocab(docs)
        random.shuffle(docs)  # shuffle in place; random.shuffle returns None, so don't pass it to train()
        doc2vec_model.train(docs)
        docs = []
Example #10
def load_lists():
    truelist = set()
    phrase_truelist = defaultdict(set)
    module_file = inspect.getfile(inspect.currentframe())
    module_dir = os.path.dirname(os.path.abspath(module_file))
    truelist_file = os.path.join(module_dir, "truelist")
    for line in open(truelist_file):
        line = line.split("#")[0].strip()
        if line == "":
            continue
        assert not any(
            is_hyphen(c) for c in
            line), f'Truelist entries should not contain hyphens: {line}'
        if ' ' not in line:
            truelist.add(line)
        else:
            toks = tuple(tokenize(line))
            phrase_truelist[len(toks)].add(
                toks)  # group phrases by number of tokens
    phrase_truelist = sorted(phrase_truelist.items(),
                             reverse=True)  # bins sorted by phrase length
    special_file = os.path.join(module_dir, "special-case-titles")
    with open(special_file) as inF:
        special_titles = {
            line.strip().lower(): line.strip()
            for line in inF if line.strip()
        }
    amodifiers = (
        'North',
        'South',
        'East',
        'West',
        'Northeast',
        'Northwest',
        'Southeast',
        'Southwest',
        'Central',
        'Northern',
        'Southern',
        'Eastern',
        'Western',
        'Northeastern',
        'Northwestern',
        'Southeastern',
        'Southwestern',
        'Modern',
        'Ancient',
    )  # use subsequent word to determine fixed-case. will miss hyphenated modifiers (e.g. South-East)
    ndescriptors = (
        'Bay',
        'Coast',
        'Gulf',
        'Island',
        'Isle',
        'Lake',
        'Republic',
        'University',
    )  # use preceding word to determine fixed-case

    return truelist, phrase_truelist, special_titles, amodifiers, ndescriptors
def junk_count(html):
    tokens = tokenize(BS(html).get_text())
    tokens = [
        token for token in tokens
        if not token.isalpha() and token not in string.punctuation
    ]
    return len(tokens)
Example #12
def predict(text):

    tokens = tokenize(text)

    i = 0
    prep, origin = prepare_tokens(tokens[i:i + 200])

    pred = model.predict(prep)

    origin = fix_holes(origin, pred)

    score = overall_model.predict(prep)[0][0]
    cnt = 1

    while i * 200 < len(origin):

        p, o = prepare_tokens(tokens[i:i + 200])
        i += 1
        pred = model.predict(p)

        o = fix_holes(o, pred)

        origin = origin + o
        score += overall_model.predict(prep)[0][0]
        cnt += 1

    return origin, score / cnt


#print(prepare_text(a), model.predict(pa))
Example #13
def prepare_text(text):
    tokens = tokenize(text)

    original = []

    filteredTokens = []

    for x in tokens:
        if x in w2v.vocab:
            filteredTokens.append(x)
            original.append(0.0)
        else:
            original.append(None)

    # for word in filteredTokens:
    #
    #
    #     output.append(w2v.word_vec(word))

    output = list(map(lambda word: w2v.word_vec(word), filteredTokens))

    if len(output) > 200:
        output = output[:200]
    else:

        while len(output) < 200:
            output.append([0.0] * 300)

    return np.array([output]), original
def get_tokenized_dialog_lines(iterable_dialog_lines):
    """
    Tokenizes with nltk tokenizer, adds START_TOKEN, EOS_SYMBOL
    :param iterable_dialog_lines: IterableSentences
    :return: IterableSentences
    """
    return iterable_dialog_lines.add_postprocessing(lambda x: [START_TOKEN] + tokenize(x) + [EOS_SYMBOL])
Example #15
def rank_sentences( tagdict, tags, topK, cooccurances, probs):
	stop_lst = set(filter_reviews.get_stop_lst())
	punctuation = set(['.',',','?','!','\'','\"','`','``','*','-','/','+'])
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	tokenizer2 = RegexpTokenizer(r'(\w|\')+')
	tag_sentence_rank = {}
	stemmer = PorterStemmer()
	tokenize = tokenizer2.tokenize
	stem = stemmer.stem
	for tag in tags:
		candidates = tagdict[tag]
		if len(candidates) <= topK:
			tag_sentence_rank[tag] = candidates
			continue

		scores = []
		for (reviewIdx, sentence) in candidates:
			score = 0
			tokens = tokenize(sentence)
			clean_line = [stem(token) for token in tokens if token not in stop_lst and token not in punctuation and token.isalpha()]
			score = ( score + score_sentence_tag(clean_line, tag, cooccurances, probs) + 1) / (1.0*len(clean_line) + 1.0)
			scores.append((score, (reviewIdx, sentence) ))
		#max(scores)
		ret = sorted(scores, key=lambda score_sent: score_sent[0], reverse=True)[:topK]
		tag_sentence_rank[tag] = ret
		#for i,pair in enumerate(candidates):
		#	if scores[i] >= lowest_score
	return tag_sentence_rank
Example #16
 def post(self):
     text = self.get_argument("rawtext")
     relations = []
     entities = []
     tokens = []
     IN = re.compile(r'.*\bin\b')
     doc.headline = ['a']
     def tokenize(text):
         for sentence in nltk.sent_tokenize(text):
             for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence))):
                 if hasattr(chunk, 'node'):
                     if chunk.node != 'GPE':
                         tmp_tree = nltk.Tree(chunk.node, [(' '.join(c[0] for c in chunk.leaves()))])
                     else:
                         tmp_tree = nltk.Tree('LOCATION', [(' '.join(c[0] for c in chunk.leaves()))])
                     tokens.append(tmp_tree)
                     entities.append(tmp_tree)
                 else:
                     tokens.append(chunk[0])
         return tokens
     
     def extract_people_in_locations():
         for rel in nltk.sem.extract_rels('PERSON' , 'LOCATION', doc, corpus='ieer', pattern=IN):
             filler_tokens = dict(nltk.pos_tag(nltk.word_tokenize(rel['filler'])))
             tmp = rel['subjtext'] + " is in " + rel['objtext']
             relations.append(tmp)
                 
     doc.text = tokenize(text)
     #print doc.text
     extract_people_in_locations()
     
     self.render("extractor_post.html", text=text, entities=entities, relations=relations)
Example #17
def comparison(p, plist):
    # print(p,plist)
    plisttok = tokenize_sents(plist)
    ptok = tokenize(p)
    # print(plisttok)
    data = rank_one.article2queries(plisttok, ptok, 1)
    return data[0]['driver']
Example #18
def load_file(path):

    text = ''
    with open(path, 'r') as f:
        text = f.read()

    tokens = tokenize(text)

    filteredTokens = filter(lambda x: x in w2v.vocab, tokens)
    filteredTokens = list(filteredTokens)

    output = []

    # for word in filteredTokens:
    #
    #
    #     output.append(w2v.word_vec(word))

    output = list(map(lambda word: w2v.word_vec(word), filteredTokens))

    if len(output) > timesteps:
        output = output[:timesteps]
    else:

        while len(output) < timesteps:
            output.append([0.0] * 300)

    return np.array(output)
Example #19
 def __iter__(self):
     with open(self.file, 'r') as fp:
         line = fp.readline()
         while line:
             if line != '':
                 tockenLine = ''.join(tokenize(line))
                 word_sentences = [word for word in tockenLine.split()]
                 yield word_sentences
             line = fp.readline()  # advance to the next line; otherwise this loops forever
Example #20
def preProcess(df):
    df.sentence = tokenize(df)
    df.sentence = removePunctuation(df)
    df.sentence = textNormalize(df)
    df.sentence = toLower(df)
    df.sentence = stemming(df)
    # return tfidf(df)
    return df.sentence
Example #21
 def __iter__(self):
     with open(self.file, 'r') as fp:
         line = fp.readline()
         while line:
             if line != '':
                 tockenLine = ''.join(tokenize(line))
                 word_sentences = [word for word in tockenLine.split()]
                 yield word_sentences
             line = fp.readline()  # advance to the next line; otherwise this loops forever
Example #22
    def train(self):
        with open("../../LSTM/data/sentiment/trainsentence_and_label_binary.txt", 'r') as filedata:
            data = filedata.readlines()

        tokenized_sentences_with_labels = []
        for sent in data:
            tokenized = nltk.word_tokenize(sent.lower())
            tokenized_sentences_with_labels.append((int(tokenized[0]), tokenized[1:]))
Example #23
    def generateTextObservations(text):
        X = []
        text_tokens = nltk.word_tokenize(text)

        for word in text_tokens:
            X.append(vec[word])

        return X
def process_text():
    try:
        for words in tokenized[:5]:
            tokenized_words = nltk.word_tokenize(words)
            part_of_speech_tag = nltk.pos_tag(tokenized_words)
            print(part_of_speech_tag)

    except Exception as e:
        print(str(e))
Example #25
def shinglize(s, n):
    """
    return size n shingles for the string s
    """
    shingles = set()
    tokens = tokenize(s)
    for i in range(len(tokens) - n + 1):
        shingles.add('_'.join(tokens[i:i+n]))
    return shingles
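Example output, assuming tokenize is a simple word tokenizer:

from nltk import word_tokenize as tokenize

print(shinglize("the quick brown fox", 2))
# {'the_quick', 'quick_brown', 'brown_fox'}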
Example #26
def shinglize(s, n):
    """
    return size n shingles for the string s
    """
    shingles = set()
    tokens = tokenize(s)
    for i in range(len(tokens) - n + 1):
        shingles.add('_'.join(tokens[i:i + n]))
    return shingles
def word_counts(html):
    lem = WordNetLemmatizer()
    tokens = tokenize(BS(html).get_text())
    tokens = [
        lem.lemmatize(token.lower()) for token in tokens
        if token not in string.punctuation
    ]

    return make_dict(tokens)
Example #28
def get_tokens(string):
    global stop_words
    rtn = []
    tokens = tokenize(string)

    for token in tokens:
        if token.isalnum() and token not in stop_words:
            rtn.append(token)
    return rtn
Example #29
 def _processline(self,line):
     tokens=["__START"]+tokenize(line)+["__END"]
     previous="__END"
     for token in tokens:
         self.unigram[token]=self.unigram.get(token,0)+1
         current=self.bigram.get(previous,{})
         current[token]=current.get(token,0)+1
         self.bigram[previous]=current
         previous=token
Example #30
def getBagOfWords(categories, stars, maxNumberOfReviewToUse):
    reviews = getSetOfReviews(categories, stars, maxNumberOfReviewToUse)
    output = {} #string:int
    for currentReview in reviews:
        for token in nltk.word_tokenize(currentReview):
            if token in output:
                output[token] += 1
            else:
                output[token] = 1
    return output
Example #31
 def compute_prob_line(self,line,methodparams={}):
     #this will add __START to the beginning of a line of text
     #compute the probability of the line according to the desired model
     #and returns probability together with number of tokens
     
     tokens=["__START"]+tokenize(line)+["__END"]
     acc=0
     for i,token in enumerate(tokens[1:]):
         acc+=math.log(self.get_prob(token,tokens[:i+1],methodparams))
     return acc,len(tokens[1:])
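compute_prob_line depends on a get_prob method that is not shown; below is a minimal sketch of a bigram version with add-one smoothing, built on the unigram/bigram counts from _processline in Example #29 (an assumption, not the original implementation).

 def get_prob(self, token, context, methodparams={}):
     # P(token | previous token) with add-one smoothing over the known vocabulary
     previous = context[-1]
     seen = self.bigram.get(previous, {})
     vocab_size = len(self.unigram)
     return (seen.get(token, 0) + 1) / float(sum(seen.values()) + vocab_size)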
Example #32
def count_tokens_in_chunk(idx, chunk):
    print("Processing chunk", idx)
    counts = Counter()

    for dialog in chunk:
        for utterance in dialog:
            tokens = tokenize(utterance["text"])
            counts += Counter(tokens)

    return counts
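A minimal call, assuming tokenize is a word tokenizer and a chunk is a list of dialogs, each a list of utterance dicts (mirroring the loop above; the sample data is made up):

from nltk import word_tokenize as tokenize

chunk = [[{"text": "hello there"}, {"text": "hello again"}]]
print(count_tokens_in_chunk(0, chunk))
# Counter({'hello': 2, 'there': 1, 'again': 1})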
Example #33
def word_tokens(text_or_stream):
    def tokenize(text):
        hold_back = None
        skip = False

        for word in nltk.tokenize.word_tokenize(text):
            if hold_back is not None:
                if word == hold_back[0]:
                    yield Token(hold_back[0])
                    yield Token(hold_back[1])
                    yield Token(word)
                    skip = True
                else:
                    yield Token(hold_back[0] + hold_back[1])

                hold_back = None

            if not skip:
                if word.startswith(Token.APOSTROPHE):
                    # Use hold_back to fix tokenization errors of the form:
                    # | input  | output  | expected |
                    # | ------ | ------- | -------- |
                    # | 'word' | 'word ' | ' word ' |
                    hold_back = (word[0], word[1:])
                else:
                    hold_back = None

                if hold_back is None:
                    yield Token(word)

            skip = False

        if hold_back is not None:
            yield Token(hold_back[0] + hold_back[1])

    if isinstance(text_or_stream, str):
        for token in tokenize(text_or_stream):
            yield token
    else:
        for text in text_or_stream:
            for token in tokenize(text):
                yield token
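word_tokens relies on a project-level Token class that is not shown; a minimal stand-in (hypothetical) is enough to see the apostrophe repair described in the comment table above:

import nltk

class Token(str):
    APOSTROPHE = "'"  # hypothetical stand-in; the real Token class is not shown

print(list(word_tokens("He said 'word' twice.")))
# roughly: ['He', 'said', "'", 'word', "'", 'twice', '.']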
def punc_count(html):
    tokens = tokenize(BS(html).get_text())
    count = 0
    for token in tokens:
        if len(token) > 1:
            for char in token:
                if char in string.punctuation:
                    count += 1
                    break

    return count
def compute_yules_k_for_text(sentence):
    tokens = tokenize(sentence)
    counter = Counter(token.upper() for token in tokens)

    #compute number of word forms in a given sentence/text
    m1 = sum(counter.values())
    m2 = sum([frequency ** 2 for frequency in counter.values()])

    #compute yules k measure and return the value
    yules_k = 10000/((m1 * m1) / (m2 - m1))
    return yules_k
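A hand-checkable example of the formula (K = 10,000 * (M2 - M1) / M1^2), assuming tokenize is a word tokenizer:

from nltk import word_tokenize as tokenize

# "the cat saw the cat": M1 = 5 tokens, M2 = 2^2 + 2^2 + 1^2 = 9, so K = 10000 * 4 / 25 = 1600
print(compute_yules_k_for_text("the cat saw the cat"))  # 1600.0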
Example #36
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
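The two helpers used here are not shown; common definitions they likely follow (a sketch assuming sublinear tf and a smoothed idf; the real project may differ):

import math

def sublinear_term_frequency(term, tokenized_document):
    count = tokenized_document.count(term)
    return 1 + math.log(count) if count > 0 else 0

def inverse_document_frequencies(tokenized_documents):
    idf = {}
    all_terms = set(t for doc in tokenized_documents for t in doc)
    for term in all_terms:
        containing = sum(1 for doc in tokenized_documents if term in doc)
        idf[term] = 1 + math.log(len(tokenized_documents) / float(containing))
    return idf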
Example #37
def is_match(a, b):
    """Check if a and b are matches."""
    pos_a = map(get_wordnet_pos, nltk.pos_tag(tokenize(a)))
    pos_b = map(get_wordnet_pos, nltk.pos_tag(tokenize(b)))
    lemmae_a = [
        lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos)
        for token, pos in pos_a
        if pos == wordnet.NOUN and token.lower().strip(string.punctuation)
        not in stopwords
    ]
    lemmae_b = [
        lemmatizer.lemmatize(token.lower().strip(string.punctuation), pos)
        for token, pos in pos_b
        if pos == wordnet.NOUN and token.lower().strip(string.punctuation)
        not in stopwords
    ]

    # Calculate Jaccard similarity
    intersect = set(lemmae_a).intersection(lemmae_b)
    union = set(lemmae_a).union(lemmae_b)
    return len(intersect) / float(len(union))
Example #38
def get_lines_for_validation(validation_set_path, index_to_token):
    with codecs.open(validation_set_path, 'r', 'utf-8') as dataset_fh:
        lines = dataset_fh.readlines()
        lines = [tokenize(line.strip()) for line in lines]
        screened_lines = get_transformed_dialog_lines(lines, index_to_token.values())

    # return true array, not iterator
    lines_for_validation = []
    for line in screened_lines:
        lines_for_validation.append(line)

    return lines_for_validation
Example #39
def parse_text_to_stems(language, text, min_length=3):
    """ Parse a text attribute performing cleanup, tokenization, stemmization and removal of stop-words.

        :param language: The text language, relevant for stemmization.
        :param text: The text to be stemmized.
        :param min_length: The minimum number of characters that a word must have; otherwise it is discarded.

        :returns: A list of terms.
    """
    text = re.sub(" +", " ", text).lower()
    tokens = tokenize(text)
    stems = get_stems(tokens, language)
    return remove_stopwords(stems, language, min_length)
Example #40
def file2tokenized_list(files, lower=True, encoding="utf-8"):
    """
    Takes a filepath, or a list of filepaths and returns lists of tokenized
    strings.

    You can actually also feed it a dictionary of file path lists, where each
    key of the dictionary represents some category. If you chose to feed it a
    dictionary, then the output will be a tuple with 3 values.
        tokenized_list = the usual lists of tokenized strings.
        labels         = a list containing integer labels corresponding to the
                         category that each element of tokenized_list belongs to
        cats           = A list of the unique category names. Indices of the
                         names correspond to the integer values used in labels
                         such that cats[labels[i]] gives you the original name
                         for the category that the ith training example belongs
                         to.

    :param files: (str, or list of strings or dict)
        String of a single file path, or a list of file path strings.
    :param lower: (bool)(default = True)
        Convert all text to lowercase?
    :param encoding: (str)(default = "utf-8")
        Encoding used in the text files
    :return:
        A list of lists of tokenized strings (if files is a string or a list of
        strings)

        A tuple of 3 items if files is a dictionary containing lists of strings.
        (see the description section for more details about the 3 elements
        returned)
    """
    # ==========================================================================
    #TODO: add argument replacements, a dictionary of regex replacements:
    #      whenever some pattern is encountered, replace it with other text.
    print("Generating a tokenised list from files")
    if isinstance(files, dict):
        return dict_file2tokenized_list(files, lower, encoding)
    if isinstance(files, str):
        files = [files]
    num_items = len(files)

    tokenized_list = ["MISSING"] * num_items    # Will store the tokenised text
    for i in range(num_items):
        with open(files[i], "r") as textFile:
            text = textFile.read()
        text = text.decode(encoding)
        if lower:
            text = text.lower()
        tokenized_list[i] = tokenize(text)
    print("---Done!")
    return tokenized_list
Example #41
def main():
	data = "data/top_100_entities.txt"
	pathToData = "data/funnyReviews/"
	fileName = "rev_data_"
	suffix = ".txt"

	lines = []
	with open(data, 'r') as f:
		lines = f.readlines()
	
	entity_weight = []
	wn_entities = []	
	for line in lines:
		key =  line.split(",")[0].split(":")[1].strip()
		value = line.split(",")[1].split(":")[1].strip()
		#print wn.synsets(key, pos=wn.NOUN)
		wn_entities.append(wn.synsets(key, pos=wn.NOUN)[0])
		entity_weight.append( (key, value) )
	
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	tokenizer2 = RegexpTokenizer(r'(\w|\')+')
	stemmer = PorterStemmer()
	tokenize = tokenizer2.tokenize
	stem = stemmer.stem
	stop_lst = get_stop_lst()	
	punctuation = set(['.',',','?','!','\'','\"','`','``','*','-','/','+'])

	review = ""
	with open(pathToData + fileName + str(1) + suffix, 'r') as f:
		review = f.read()


	review_lines = tokenizer.tokenize(review.lower())
	scores = []
	for sentence in review_lines:
		tokens = tokenize(sentence)
		clean_line = [stem(token) for token in tokens if token not in stop_lst and token not in punctuation and token.isalpha()]
		#print clean_line
		for item in clean_line:
			word1 = wn.synsets(item, pos=wn.NOUN)
			if len(word1) > 0:
				if word1[0] in wn_entities:
					print word1[0]
		

	# score = ( score + score_sentence_tag(clean_line, tag, cooccurances, probs) + 1) / (1.0*len(clean_line) + 1.0)
                        			
	
	return 0
def mark_possible_duplicates(dict_list, key):
    """
    Marks the possible duplicates strings
    """
    number_of_titles = len(dict_list)
    full_match_criteria = 0.9

    # Build a list of token sets for the strings in the key 
    token_list = []
    for i in range(number_of_titles):
        # Removes stop words to create token set
        value = tokenize(dict_list[i][key]) 
        value = remove_stop_words(value)
        token_list.append(value)

    # Dict of objects with the structure {(i:j) : score}
    similarity_map = {}
    score_threshold = 0.5

    # Indexes of tokens will match indexes in dict_list
    for i in range(number_of_titles):
        # Brute-force comparison of all pairs (O(n^2)/2); seems fast enough for this
        for j in range(i+1, number_of_titles):
            score = get_token_set_match_ratio(token_list[i], token_list[j])
            if score >= score_threshold:
                similarity_map[(i, j)] = score

    partial_matches = 0 
    full_matches = 0 
    for k, val in sorted(similarity_map.iteritems()):
        # print(k)
        # print(val)
        if (val < full_match_criteria):
            # print(dict_list[k[0]][key]).encode('utf-8') 
            # print(dict_list[k[1]][key]).encode('utf-8') 
            # print('Similarity score: ' + str(val))  
            partial_matches +=1
        else:
            mark_database_for_full_match(k, dict_list)
            full_matches += 1
    
    add_match_clusters(similarity_map, dict_list)
    remove_match(similarity_map, dict_list, threshold = full_match_criteria)
    remove_ids_for_corrected_clusters(dict_list)

    print('Partial matches: ' + str(partial_matches))
    print('Full matches: ' + str(full_matches))

    return dict_list
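get_token_set_match_ratio is not defined in this snippet; one plausible definition consistent with the 0-1 score_threshold above is a Jaccard ratio over the two token sets (an assumption, not the original):

def get_token_set_match_ratio(tokens_a, tokens_b):
    a, b = set(tokens_a), set(tokens_b)
    return len(a & b) / float(len(a | b)) if (a | b) else 0.0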
def dict_file2tokenized_list(files, lower=True, encoding="utf-8"):
    """
    Takes a dictionary of file path lists, where each key of the dictionary
    represents some category.

    The output will be a tuple with 3 values.
        tokenized_list = lists of tokenized strings.
        labels         = a list containing integer labels corresponding to the
                         category that each element of tokenized_list belongs to
        cats           = A list of the unique category names. Indices of the
                         names correspond to the integer values used in labels
                         such that cats[labels[i]] gives you the original name
                         for the category that the ith training example belongs
                         to.

    :param files: (str, or list of strings or dict)
        String of a single file path, or a list of file path strings.
    :param lower: (bool)(default = True)
        Convert all text to lowercase?
    :param encoding: (str)(default = "utf-8")
        Encoding used in the text files
    :return:
        A tuple of 3 items if files is a dictionary containing lists of strings.
        (see the description section for more details about the 3 elements
        returned)
    """
    # ==========================================================================
    cats = files.keys()
    num_per_category = {cat: len(files[cat]) for cat in cats}
    num_items = sum(num_per_category.values())

    tokenized_list = ["MISSING"] * num_items  # Will store the tokenised text
    labels = ["MISSING"] * num_items  # Will store the labels

    running_index = 0
    for cat_i, cat in enumerate(cats):
        #num_items_for_cat =
        for example_i in range(num_per_category[cat]):
            with open(files[cat][example_i], "r") as textFile:
                text = textFile.read()
            text = text.decode(encoding)
            if lower:
                text = text.lower()
            tokenized_list[running_index] = tokenize(text)
            labels[running_index] = cat_i
            running_index += 1
    print("---Done!")
    return (tokenized_list, labels, cats)
Example #44
 def get_prediction(inp):
     tokens = tokenize(inp)
     if len(tokens) > timesteps:
         import sys
         sys.stderr.write("Exceeding allowed input length. "
                          "Cutting off after {} tokens.".format(timesteps))
         tokens = tokens[:timesteps]  # cut off after max model timesteps
     vecs = vectorizer.transform([tokens])  # list of seqs
     vecs = vecs.repeat(batch_size, axis=0)
     input_lengths = np.array([len(tokens)] * batch_size)
     pred, _ = model.step(session, task, vecs, input_lengths, mode="decode")
     pred = pred.reshape([batch_size, timesteps, -1])
     # Find all candidate words per timestep (all words from top k clusters)
     candidates = {}
     for t in range(timesteps):
         if np.argmax(pred[0][t]) == dio.PAD_ID:
             break
         candidates[t] = set()
         # get the top k clusters
         # (http://stackoverflow.com/questions/6910641/)
         topclusters = np.argpartition(pred[0][t],
                                       -k_clusters)[-k_clusters:]
         for c in topclusters:
             candidates[t].update(task.i2l[c])  # expand with this cluster
     # Find the optimal sequence from the candidate words using beam search
     global lm
     if not lm:
         lm = kenlm.Model('data/lms/en-70k-0.2-pruned.lm')
     hypos = ["<s>"]
     for t in range(len(candidates)):
         t_hypos = []
         content_words = [
             w for w, t in nltk.pos_tag(tokens) if t in CONTENT_POS
         ]
         if prune:
             copies = candidates[t].intersection(set(content_words))
             if copies:
                 candidates[t] = copies
         for cand in candidates[t]:
             for h in hypos:
                 cand_t = h + " " + cand
                 score = lm.score(cand_t)  # get language model score
                 t_hypos.append((cand_t, score))
         # get beam_width highest scoring hypotheses
         hypos = [
             h for h, s in sorted(t_hypos, key=lambda x: x[1])[-beam_width:]
         ]
     return hypos[:-k_best:-1]  # k highest scoring hypos, revert list
def dict_file2tokenized_list(files, lower=True, encoding="utf-8"):
    """
    Takes a dictionary of file path lists, where each key of the dictionary
    represents some category.

    The output will be a tuple with 3 values.
        tokenized_list = lists of tokenized strings.
        labels         = a list containing integer labels corresponding to the
                         category that each element of tokenized_list belongs to
        cats           = A list of the unique category names. Indices of the
                         names correspond to the integer values used in labels
                         such that cats[labels[i]] gives you the original name
                         for the category that the ith training example belongs
                         to.

    :param files: (str, or list of strings or dict)
        String of a single file path, or a list of file path strings.
    :param lower: (bool)(default = True)
        Convert all text to lowercase?
    :param encoding: (str)(default = "utf-8")
        Encoding used in the text files
    :return:
        A tuple of 3 items if files is a dictionary containing lists of strings.
        (see the description section for more details about the 3 elements
        returned)
    """
    # ==========================================================================
    cats = files.keys()
    num_per_category  = {cat: len(files[cat]) for cat in cats}
    num_items = sum(num_per_category.values())

    tokenized_list = ["MISSING"] * num_items    # Will store the tokenised text
    labels = ["MISSING"] * num_items            # Will store the labels

    running_index = 0
    for cat_i, cat in enumerate(cats):
        #num_items_for_cat =
        for example_i in range(num_per_category[cat]):
            with open(files[cat][example_i], "r") as textFile:
                text = textFile.read()
            text = text.decode(encoding)
            if lower:
                text = text.lower()
            tokenized_list[running_index] = tokenize(text)
            labels[running_index] = cat_i
            running_index += 1
    print("---Done!")
    return (tokenized_list, labels, cats)
def get_input_sequence(sentence):
    """
    Prepare chatbot's input by tokenizing the sentence and adding necessary punctuation marks.
    Input: "So what's up, buddy"
    Output: ["so", "what", "'", "s", "up", ",", "buddy", ".", "$$$"]
    """
    if not sentence:
        return [START_TOKEN, EOS_SYMBOL]

    # add a dot to the end of the sentence in case there is no punctuation mark
    if sentence[-1] not in _PUNKT_MARKS:
        sentence += '.'

    sequence = [START_TOKEN] + tokenize(sentence) + [EOS_SYMBOL]

    return sequence
def json_converter_dfd_orig():
    file_list = make_filelist(raw_doc_path_dfd_orig)
    for filepath in file_list:
        dfd_orig_one_doc_map = {}
        # split the labels out of the filename
        label_list = (os.path.basename(filepath)).split('_')[:-1]
        # tokenized_document is a list
        tokenized_document = tokenize(filepath)
        # I don't know Dutch, but lowercase everything just in case
        tokenized_document = [t.lower() for t in tokenized_document]

        dfd_orig_one_doc_map['labels'] = label_list
        dfd_orig_one_doc_map['doc_str'] = tokenized_document

        print filepath
        with codecs.open(json_doc_path_dfd_orig + os.path.basename(filepath), 'w', 'utf-8') as json_content:
            json.dump(dfd_orig_one_doc_map, json_content, ensure_ascii=False, indent=4)
Example #48
    def train(self):
        for line in self:
            self.i += 1
        self.minimum = int(round(math.log(self.i, 10)))
        i = 0
        for line in self:
            tokens = tokenize(decode(line).lower())
            targets = self.get_targets(tokens)
            for gram in targets:
                self.posterior[gram] += 1
                self.grams_by_line[i].add(gram)
                for token in gram:
                    self.prior[token] += 1
            i += 1
#         print self.prior.most_common(10)
#         print self.posterior.most_common(10)
        self.crunch()
Example #49
def calculate_tf(lang="", doc=""):
    """ Returns a map with all non-stopwords and its respective frequencies.

        Ex: {"work": 1, "going": 1}
    """
    tf_by_stem = {}

    # Cleaning document
    doc = re.sub(" +", " ", doc).lower()
    tokens = remove_stopwords(tokenize(doc), lang, min_len=3, max_len=30)

    stems = get_stems(tokens, lang)

    for stem in stems:
        tf_by_stem[stem] = tf_by_stem.get(stem, 0) + 1

    return tf_by_stem
Example #50
def main():
  try:
    reader = csv.reader(open(args.files[0]))
    next(reader)                                        # skip header row
  except:
    print "Error: could not read ", args.files[0]

  try:
    writer = open(args.files[1], "w")
  except:
    print "Error: could not write to file", args.files[1]

  # stopwords = nltk.corpus.stopwords.words('english')
  POS_TAGS = ['CC','CD', 'DT','EX','FW','IN', 'JJ','JJR', 'LS', 'MD', 
  'NN','NNS', 'NNP','NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB','RBR', 'RBS', 'RP', 'TO', 
  'UH', 'VB', 'VBG', 'VBN','VBP', 'VBZ', 'WDT','WP', 'WP$', 'WRB']

  # First column (label) for each input line should be either 0 or 1 (train) or row number (test) 
  for line in reader:
    if int(line[0]) < 0:        # Labels in test and train should be at least 0
      continue
    else:        
      label = line[0]
      if label == "0":
        label =   "-1"

      lineout = label           # Initialize lineout to be the label
      if NAMESPACE == "C":      # No parts of speech
        tokens = tokenize(line[2])
        lineout = lineout + " |C " + (''.join( str(x) for x in tokens))

      elif NAMESPACE == "POS":  # Use each part of speech as a namespace for VW
        tokens = tokenize_pos(line[2])
        for key in tokens:
          if key not in POS_TAGS:
            continue
          else:
            lineout = lineout + " |" + key  + " " + (''.join( str(x) for x in tokens[key])) + " "
      else:
        print "ERROR - only two namespace options defined"
        exit(1)
      writer.write(lineout + "\n")
  writer.close()
Example #51
def find_sentences_from_reviews(tag_review_dict):
	d = {}
	print "looking for sentences"
	stop_lst = set(get_stop_lst())
	tokenize=tokenizer2.tokenize
	stem = stemmer.stem
	for tag, reviews in tag_review_dict.iteritems():
		print "current tag: ", tag
		sentences = []
		for (idx, review) in reviews:
			lines = tokenizer.tokenize(review)
			for line in lines:
				#tokens = nltk.wordpunct_tokenize(line)
				tokens = tokenize(line)
				clean_line = [stem(token) for token in tokens if token not in stop_lst and token not in punctuation and token.isalpha()]
				if tag in clean_line:
					sentences.append((idx,line))
		d[tag] = sentences
	return d
Example #52
# <codecell>

nltk.download()

# <markdowncell>

# Density
# =======

# <codecell>

from nltk import word_tokenize as tokenize

# <codecell>

nltk.pos_tag(tokenize("The quick brown fox jumps over the lazy dog."))

# <codecell>

nltk.pos_tag(tokenize("If I were you I wouldn't do that with these."))

# <markdowncell>

# Create a density checker

# <codecell>

import re

matches = lambda x, re_parts: any([re.findall(y, x) for y in re_parts])
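A quick check of the matches helper: it is truthy when any of the given regex patterns is found in the word (the sample words and patterns are illustrative).

# <codecell>

print(matches("jumps", [r"s$", r"ing$"]))  # True
print(matches("lazy", [r"s$", r"ing$"]))   # False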
Example #53
def get_tokenized_dialog_lines(iterable_dialog_lines):
    for line in iterable_dialog_lines:
        tokenized_dialog_line = tokenize(line)
        tokenized_dialog_line = [START_TOKEN] + tokenized_dialog_line + [EOS_SYMBOL]
        yield tokenized_dialog_line
Example #54
		thisTweet['minute_utc'] = float(created_utc.minute)
		thisTweet['day_time_utc'] = thisTweet['day_of_week_utc'] + float(created_utc.hour) / 24.0
		created_est = created_utc.replace(tzinfo=pytz.utc).astimezone(localTz)
		thisTweet['date_est'] = str(created_est.date())
		thisTweet['day_of_week_est'] = created_est.weekday()
		thisTweet['weekend_est'] = thisTweet['day_of_week_est'] >= 5
		thisTweet['time_est'] = float(created_est.hour) + float(created_est.minute)/60
		thisTweet['minute_est'] = float(created_est.minute)
		thisTweet['day_time_est'] = thisTweet['day_of_week_est'] + float(created_est.hour) / 24.0
		# Org features.
		thisTweet['org'] = org
		thisTweet['org_category'] = orgData['category'].lower()
		thisTweet['social_flow_user'] = orgData['socialFlow']
		thisTweet['followers_count'] = twAccounts[org]['followers_count']
		# Sentiment features.
		thisTweet['word_count'] = len([t for t in tokenize(cleanMessage(tweet['text'])) if t not in skipTokens])
		sent = sentiment.get(str(tweet['id']))
		if sent:
			thisTweet['sentiment_class'] = sent['class']
			thisTweet['sentiment_score_positive'] = sent['meanScorePosSig']
			thisTweet['sentiment_score_negative'] = sent['meanScoreNegSig']
		else:
			thisTweet['sentiment_class'] = None
			thisTweet['sentiment_score_positive'] = None
			thisTweet['sentiment_score_negative'] = None

		# Tweet outcomes.
		thisTweet['favorites'] = tweet['favorite_count']
		thisTweet['retweets']  = tweet['retweet_count']

		# Bitly features. This is going to be tougher.
Example #55
# coding: utf-8


import sys; print('Python %s on %s' % (sys.version, sys.platform))
sys.path.extend(['/home/rharriso/Code/Python/NLTKWorkspace'])

# Page 79 Natural language processing in python
from __future__ import division
import nltk, re, pprint
f = open('ASOIAF/A Clash of Kings A Song of Ice and Fire Book 2_nodrm.txt')
txt = f.read()
# failed interactive attempts kept from the console history
# (nltk.text is a module, not a callable, and `text` was not yet defined):
# text = nltk.text(text)
# text = nltk.text(nltk.tokenize(text))
# text = nltk.text(nltk.word_tokenize(text))
# nltk.word_tokenize(txt)
tokens = nltk.word_tokenize(txt)
text = nltk.Text(tokens)
text.concordance("Arya")
#
# Load a feed
#
import feedparser
llog = feedparser.parse("http://rharriso.github.io/feed.xml")
llog['feed']
llog['feed']['title']
len(llog.entries)
# post = llog[2]          # wrong: posts live under llog.entries
llog.entries[2]
post = llog.entries[2]
post.content