Example #1
def vectorize_dataset(dataset, word2idx, memory_size, sentence_length):
    def word2idx_func(x):
        return word2idx.get(x, 0)

    def pad_2d_to(width, array):
        d1, d2 = abs(width[0] - array.shape[0]), abs(width[1] - array.shape[1])
        return np.pad(array, ((0, d1), (0, d2)), 'constant')

    def pad_1d_to(width, array):
        d = abs(width - array.shape[0])
        return np.pad(array, ((0, d)), 'constant')

    N = len(dataset)
    facts = np.zeros((N, memory_size, sentence_length))
    query = np.zeros((N, sentence_length))
    answer = np.zeros((N))
    for idx, (fcts, q, a) in enumerate(dataset):
        facts[idx] = pad_2d_to([memory_size, sentence_length],
                               np.vstack([
                                   pad_1d_to(
                                       sentence_length,
                                       np.fromiter(
                                           map(word2idx_func, tokenize(f)),
                                           np.int32)) for f in fcts
                               ])[-memory_size:])
        query[idx] = pad_1d_to(
            sentence_length,
            np.fromiter(map(word2idx_func, tokenize(q)), np.int32))
        answer[idx] = word2idx_func(a)
    return facts, query, answer
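A minimal way to exercise this function (the whitespace tokenize, word2idx table, and toy dataset below are stand-ins invented for illustration):

import numpy as np

def tokenize(s):  # stand-in tokenizer: plain whitespace split
    return s.split()

word2idx = {'<unk>': 0, 'mary': 1, 'went': 2, 'to': 3, 'the': 4,
            'kitchen': 5, 'where': 6, 'is': 7}
dataset = [(['mary went to the kitchen'], 'where is mary', 'kitchen')]

facts, query, answer = vectorize_dataset(dataset, word2idx,
                                         memory_size=5, sentence_length=6)
print(facts.shape, query.shape, answer.shape)  # (1, 5, 6) (1, 6) (1,)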
Example #2
    def get_norm_words(string):
        if stem:
            return [
                stemmer.stem(word)
                for word in tokenizer.tokenize(string.lower())
                if word not in stop_words
            ]
        else:
            return tokenizer.tokenize(string)
Example #3
def main(ignore_len=3, report_crs=False):
    for line in fileinput.input():
        src, trg, crs = diff2before_after(line.strip(), report_crs)

        before = tokenize(src)
        after = tokenize(trg)
        if len(before) > ignore_len and len(after) > ignore_len:
            print(src, file=sys.stderr)
            print(trg)
            if report_crs:
                with open("crs.txt", "w", encoding="utf-8") as outfile:
                    outfile.write(str(crs))
Example #4
 def get_token_splitter(self, type="unigram"):
     """
     Returns a "tokenisation" function, but potentially also for bigrams,
     or for both unigrams and bigrams.
     """
     if type == "unigram":
         return lambda s: tokenize(s)
     elif type == "bigram":
         return bigram_splitter
     elif type == "both":
         # concatenation of both unigrams and bigrams
         return lambda s: tokenize(s) + bigram_splitter(s)
     else:
         return lambda s: []
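A rough illustration of the "both" mode, assuming tokenize is a plain whitespace split and bigram_splitter behaves as in Example #31 below:

def tokenize(s):
    return s.split()

def bigram_splitter(text):
    tokens = tokenize(text)
    return [tokens[i] + " " + tokens[i + 1] for i in range(len(tokens) - 1)]

both = lambda s: tokenize(s) + bigram_splitter(s)  # what type="both" returns
print(both("new york city"))
# ['new', 'york', 'city', 'new york', 'york city']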
Example #5
def cleanSW(inputSen):

    tokens = tokenize(inputSen)
    stop_words = set(stopwords.words())  # build the stopword set once, not per token
    clean_tokens = [
        x for x in tokens if x not in stop_words
    ]  # drop words with little meaning in the sentence, e.g. 'a', 'is'
    return clean_tokens
Example #6
def create_vocabulary(directory):
    tokens = [
        token for f in os.listdir(directory)
        for token in tokenize(open(os.path.join(directory, f)).read())
        if not token.isdigit()
    ]
    return {v: k for k, v in enumerate(set(tokens), start=1)}
Example #7
def main():
    NOVELS_DIR = 'Limpios'
    novelas = os.listdir(pathlib.Path(NOVELS_DIR))

    stemmer = SnowballStemmer("english")
    lemmer = WordNetLemmatizer()
    sw = stopwords.words('english')
    corpus = open('corpus_total.txt', 'w', encoding='utf8')
    improcesables = []
    for novela in novelas:
        print('procesando {}'.format(novela))
        novela_path = pathlib.Path(NOVELS_DIR, novela)

        try:
            titulo = novela_path.stem
            with open(novela_path) as f:
                libro = f.read()
                libro = libro.lower()
                libro = libro.strip().split()
                libro = ' '.join(libro)
                tokens = (w for w in tokenize(libro) if w not in sw)
                lemas = (lemmer.lemmatize(tok).lower() for tok in tokens
                         if tok.isalpha())
                tokenizada = ' '.join(lemma for lemma in lemas).strip()
                # tokenizada = ' '.join(x for x in novel_words(f, lemmer, sw))

            corpus.write('{} {}\n'.format(titulo, tokenizada))
        except UnicodeDecodeError:
            improcesables.append(novela)
    corpus.close()
Example #8
 def readdataset(p, wdic, maxlen=100):
     dataret = []
     goldret = []
     toolong = 0
     realmaxlen = 0
     wdic[None] = masksym
     with open(p) as f:
         data = csv.reader(f, delimiter=",")
         for row in data:
             rowelems = tokenize(row[2])
             realmaxlen = max(realmaxlen, len(rowelems))
             if len(rowelems) > maxlen:
                 toolong += 1
             for rowelem in set(rowelems):
                 if rowelem not in wdic:
                     wdic[rowelem] = len(wdic)
             dataret.append([wdic[x] for x in rowelems])
             goldret.append(row[0])
     print("{} comments were too long".format(toolong))
     maxlen = min(maxlen, realmaxlen)
     datamat = np.ones((len(dataret) - 1, maxlen)).astype("int32") * masksym
     for i in range(1, len(dataret)):
         datamat[i - 1, :min(len(dataret[i]), maxlen
                             )] = dataret[i][:min(len(dataret[i]), maxlen)]
     return datamat, np.asarray(goldret[1:], dtype="int32"), wdic
Example #9
def predict(name, command):
    command = command.lower()

    label_path = path.join(path.dirname(path.realpath(__file__)), "intents",
                           "config", "labels", "%s_labels.json" % name)
    with open(label_path, encoding="utf8") as f:
        labels = json.load(f)

    word_vocab = Vocabulary()
    word_vocab.load("%s_word_vocab.json" % name)

    #char embedding
    char_vocab = Vocabulary()
    char_vocab.load("%s_char_vocab.json" % name)

    idx2label = dict((idx, label) for idx, label in enumerate(labels))

    preprocessor = Preprocessor(word_vocab, None, char_vocab)
    model = BiLSTMCRF(labels, len(word_vocab), len(char_vocab))
    model.load_weights('intents/config/weights/%s.hdf5' % name)

    sentence = tokenize(command)
    features = preprocessor.transform([sentence])

    p = model.predict(features)
    predicted_labels = []
    for pred in p:
        predicted_labels.append(idx2label[pred])

    for word, label in zip(sentence, predicted_labels):
        print('%s: %s' % (word, label))
Example #10
def embedding_matrix(raw_data, embs, fixed_len):
    '''
    Expects a list of strings.
    Turns a list of linguistic units (sentences, texts, ...) into a 3d tensor,
    such that each unit is represented by a matrix of concatenated embeddings
    of the words in this unit.
    '''
    matrices = []
    for sent in raw_data:
        words = [w.lower() for w in tokenize(sent)]  # lowercase tokens (w avoids shadowing builtin str)
        features = [embs.represent(w) for w in words]
        # now pads the left margin
        zeros = np.zeros((fixed_len, embs.dim))
        i = 1
        while i <= fixed_len and i <= len(features):
            zeros[-i, :] = features[-i]
            i += 1

        matrices.append(zeros)
    return np.stack(matrices, axis=0)
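A small sanity check with a dummy embedding wrapper (DummyEmbs and the whitespace tokenize are invented stand-ins; real code would pass an object exposing .dim and .represent()):

import numpy as np

def tokenize(s):  # stand-in: whitespace split
    return s.split()

class DummyEmbs:
    dim = 4
    def represent(self, word):
        return np.full(self.dim, float(len(word)))

tensor = embedding_matrix(["a short sentence", "hi"], DummyEmbs(), fixed_len=5)
print(tensor.shape)  # (2, 5, 4)
print(tensor[1])     # only the last row is non-zero: units are left-padded with zeros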
Example #11
def clean_data(text, entry):
    # Converting all the text to lower case
    sentences = text.lower()

    # Expanding all negative contractions (each substitution applied cumulatively)
    for t in NEG_CONTRACTIONS:
        sentences = re.sub(t[0], t[1], sentences)

    # Converting the sentences into specific tokens
    tokens = tokenize(sentences)

    # converting some other contractions such as 'm as m
    tokens = [
        OTHER_CONTRACTIONS[token] if OTHER_CONTRACTIONS.get(token) else token
        for token in tokens
    ]

    ENGLISH_STOPWORDS = set(stopwords.words('english'))
    letters_only = r'[a-z]+'
    # Keeping only tokens that contain at least one letter (drops pure punctuation)
    tokens = [word for word in tokens if re.search(letters_only, word)]

    # Removing all the English Stop words
    tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]

    # Stemming the Tokens
    stemmer = nltk.PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    # Lemmatizing the tokens
    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]

    return tokens
Example #12
 def features(self):
     """Trivially tokenized words."""
     words = tokenize(self.data.lower())
     puncs = list(string.punctuation) + ['``', "''"]
     stops = stopwords.words('english')
     excluded = set(stops) | set(puncs)  # build the lookup set once
     words = [w for w in words if w not in excluded]
     return set(words)
Example #13
def novel_words(novel, lemmer, sw):
    for line in novel:
        line = ' '.join(line.split()).strip()
        tokens = (w for w in tokenize(line) if w not in sw)
        nuevos = (lemmer.lemmatize(tok).lower() for tok in tokens
                  if tok.isalpha())
        yield ' '.join(nuevos)
async def process_sentence(doc: SentenceDocument):
    tokenized_sent = tokenize(doc.sentence)
    doc = TokenizedSentenceDocument(sent_tokens=[
        tokenized_sent,
    ],
                                    metadata='Single sentence')
    return process_tokenized_sentence_document(doc)
def perform_sentiment_analysis(txt):
    """
    This function is responsible for the sentiment analysis portion of the
    program. It first loads the saved and trained classification model into
    the program to be used. The function then uses the trained Naive-Bayes
    Classifier model to do Sentiment Analysis on 50 randomly selected Tweets
    out of the Tweets from the user's query in order to analyze the tone of
    these Tweets and make a prediction as to whether it is positive or
    negative. Each result is printed to the screen along with its related
    string. Finally, the overall sentiment is determined by calculating the
    percentage of positive sentiment results over the 50 randomly selected Tweets.
    Arguments: txt -- The tweet texts used for sentiment analysis by the model
    Returns: N/A
    """
    print("\n\t{}".format("----" * 16))
    print("\tSentiment Analysis:\n")
    c = pickle.load(open("classifier1.pickle", "rb"))
    print("\tUsing Naive-Bayes Classification Model to Analyze Tweets: \n")
    random.shuffle(txt)
    cl = [c.classify({x: True for x in clean(tokenize(p))}) for p in txt[:50]]
    [t.twt_print("\t({}) {}".format(a, p)) for a, p in zip(cl[:50], txt[:50])]
    pos_cnt = cl.count("+")
    neg_cnt = cl.count("-")
    sent = determine_sentiment(pos_cnt, neg_cnt)
    print("\n\tOverall Sentiment: {} ({}/{} +)".format(sent, pos_cnt, len(cl)))
    print("\n\t{}\n".format("----" * 16))
Example #16
def stemming(inputSen):
    sentence = tokenize(inputSen)
    for x in sentence:
        inputSen = inputSen.replace(
            x,
            PorterStemmer().stem(
                x))  # stemming, remove suffixes e.g. playing and play
    return inputSen
Example #17
def tokenizeDocs():

    (spellIndex, spellTitles, spellTags, spells, spellSource,
     spellDescription) = collectDocs()

    gen_docs = [[w.lower() for w in tokenize(text)] for text in spellTags]

    return spellIndex, spellTitles, gen_docs, spells, spellSource, spellDescription
def create_keyword_regex(keyword):
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    return re.compile(pattern, re.I | re.UNICODE)
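For example, a multi-word keyword becomes a case-insensitive, word-bounded pattern that tolerates any whitespace between the tokens:

regex = create_keyword_regex("machine learning")
print(regex.pattern)                            # \bmachine\s+learning\b
print(bool(regex.search("Machine  Learning")))  # True  (case-insensitive, flexible spacing)
print(bool(regex.search("machinelearning")))    # False (word boundaries)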
Example #19
def load_semeval_sents(filename):
    sents = []
    for review in Parser.parse(filename).getroot().findall('.//sentence'):
        sent = tokenize(review.find('text').text)
        sent = to_lower(sent)
        sent = filter_symbol(sent)
        sents.append(sent)
    return sents
Example #20
def getTF(path):
    stops = stopwords.words('english')
    punctuations = [
        '(', ')', ';', ':', '[', ']', ',', '.', '!', '\"', '#', '$', '%', '&',
        '\'', '*', '+', '-', '/', '<', '=', '>', '?', '@', '\\', '^', '_', '`',
        '{', '|', '}', '~'
    ]
    remove_digits = str.maketrans('', '', digits)
    dic = {}
    tf = {}
    wordz = set()
    tokens = []
    try:
        text = textract.process(path).decode().translate(remove_digits)
    except Exception as identifier:
        print('--Err: FAILED TO PARSE ' + path + ' in normal mode, trying OCR')
        try:
            text = textract.process(
                path, method='tesseract').decode().translate(remove_digits)
        except Exception as identifier:
            print('--Err: FAILED TO PARSE ' + path +
                  ' even in OCR mode, skipping')
            return dict(), 1
        #return set({}), {}

    tempTokens = tokenize(text)

    if len(tempTokens) == 0:
        print('--Err: FAILED TO PARSE ' + path + ' in normal mode, trying OCR')
        try:
            text = textract.process(
                path, method='tesseract').decode().translate(remove_digits)
        except Exception as identifier:
            print('--Err: FAILED TO PARSE ' + path +
                  ' even in OCR mode, skipping')
            return dict(), 1
        tempTokens = tokenize(text)  # re-tokenize the OCR output

    for word in tempTokens:
        lowered = word.lower()
        if lowered not in stops and lowered not in punctuations and len(word) > 1:
            tokens.append(word)

    for word in tokens:
        if word in dic:
            dic[word] = dic[word] + 1
        else:
            dic[word] = 1

    counter1 = 0
    for key, value in sorted(dic.items(),
                             key=lambda item: item[1],
                             reverse=True):
        if counter1 >= 200:
            break
        tf[key] = value
        #wordz.add(key)
        counter1 += 1
    return tf, 0
Example #21
def readAndDis():
    writeString = ""
    with open(
            'D:/SKOLE/MASTER 2016/testing/Testing database/100URL-target-context.txt',
            encoding='utf8') as fp:
        for line in fp:
            testLineArr = line.split('|')
            context = testLineArr[2]
            disWord = testLineArr[1]
            # print(context+":"+disWord)
            # context = "The British were the first to introduce armored vehicles, in 1916 -- the term tank was actually a code word intended to fool eavesdropping Germans into thinking they were discussing (inordinately deadly) water tanks. Even then, the Brits relied heavily on horses to move artillery and supplies, drafting more than a million of them to slog through the muddy trenches of Belgium and France."
            # print(pos(tokenize(context)))
            # print("The sentence : ", context, "\n########")
            regex = re.compile('[^a-zA-Z]')
            # Replace every non-letter character with a space (re.sub returns a new string)
            context = regex.sub(' ', context)

            context1 = findNouns(context)
            #Print which word from the context that you want to disambiguate
            # print("write your word \n")

            wordPos = pos(tokenize(disWord))
            print(wordPos)
            if wordPos[0][1][0] == 'V':
                targetWordSynsets = wn.synsets(disWord, pos=wn.VERB)
            else:
                targetWordSynsets = wn.synsets(disWord, pos=wn.NOUN)

            if not targetWordSynsets:  # wn.synsets returns an empty list, never None
                return writeString
            print(targetWordSynsets)
            # targetWordSynsets[0].pos

            #Run the program timer
            start = timeit.default_timer()
            synsetHashValues = disambiguationAlgo(targetWordSynsets, context1)
            end = timeit.default_timer()

            print("\n###\nTime used in algorithm : " + str(end - start) +
                  " seconds\n###\n")

            print("Based on the context :")
            # print("---".join(context1), "\n")

            writeString += "\n"
            if isinstance(synsetHashValues, dict):

                for key, value in sorted(synsetHashValues.items(),
                                         key=operator.itemgetter(1),
                                         reverse=True):
                    print(key.name() + " : " + str(value) + "\n")
                    writeString += key.name() + " : " + str(value) + "\n"
                writeString += "\n"
            else:
                writeString += synsetHashValues.name() + "\n"
    return writeString
Example #22
 def segment(self, doc):
     raw_sentences = doc.split("\n")
     sentences = []
     for sentence in raw_sentences:
         # cur_sentence = sentence.split(". ")
         cur_sentence = nltk_segment(sentence)
         if len(cur_sentence) > 0: sentences += cur_sentence
     tokenized_sentences = [tokenize(sentence) for sentence in sentences]
     return (sentences, tokenized_sentences)
Example #24
def tokenizer(mode, lowercase=False):
    if mode == 'char':
        if lowercase:
            tokenizer = (lambda s: list(s.strip().lower()))
        else:
            tokenizer = (lambda s: list(s.strip()))
    elif (mode == 'space') or (mode == 'bpe'):
        if lowercase:
            tokenizer = (lambda s: s.lower().split())
        else:
            tokenizer = str.split
    elif mode == 'word':
        if lowercase:
            tokenizer = (lambda s: tokenize(s.lower()))
        else:
            tokenizer = (lambda s: tokenize(s))
    else:
        raise ValueError('Unknown tokenizer: "%s"' % mode)

    return tokenizer
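A quick comparison of the modes (assuming tokenize is a word-level tokenizer such as NLTK's word_tokenize):

print(tokenizer('char')("Hi there"))                   # ['H', 'i', ' ', 't', 'h', 'e', 'r', 'e']
print(tokenizer('space', lowercase=True)("Hi there"))  # ['hi', 'there']
print(tokenizer('word')("Don't stop"))                 # e.g. ['Do', "n't", 'stop'] with word_tokenize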
Example #25
            def single_score_fn(s):
                s = tokenize(s)
                count_male_pronouns = sum(s.count(p) for p in male_pronouns)
                count_female_pronouns = sum(s.count(p) for p in female_pronouns)

                if count_male_pronouns > count_female_pronouns:
                    return self.POSITIVE if self.config["scorer_attribute"] == "male" else self.NEGATIVE
                if count_female_pronouns > count_male_pronouns:
                    return self.POSITIVE if self.config["scorer_attribute"] == "female" else self.NEGATIVE
                else: ## equal
                    return self.POSITIVE if self.config["scorer_attribute"] == "other" else self.NEGATIVE
Example #26
def findNouns(context):
    tokenized = tokenize(context)
    sentence = pos(tokenized)
    properNouns = [
        word for word, tag in sentence
        if tag in ('NN', 'NNS', 'NNP', 'NNPS')
    ]
    print("List of the nouns")
    # print(properNouns)
    print("\n")
    return properNouns
Example #27
def create_keyword_regex(keyword):
    print('create_keyword_regex')
    # import nltk
    ensure_package_path()
    from nltk.tokenize import wordpunct_tokenize as tokenize
    print('tokenize ==> %s' % keyword)
    tokens = tokenize(keyword)
    pattern = '\\s+'.join(tokens)
    pattern = '\\b%s\\b' % pattern
    print('compile pattern ==> %s' % pattern)
    return re.compile(pattern, re.I | re.UNICODE)
Example #28
def gensim_indexer(embeddings, doc, ignore=True):
    # type: (KeyedVectors, str, bool) -> Iterator[int]

    for word in tokenize(doc):
        try:
            yield embeddings.vocab[word.lower()].index
        except KeyError:
            if ignore:
                pass
            else:
                raise
Example #29
def make_ngrams(corpus, **kwargs):
    n = kwargs.get('n', 1)
    ncursor = 1
    stem = kwargs.get('stem', True)
    ngrams = tokenize(corpus)
    if stem:
        ngrams = [stemmer.stem(ngram) for ngram in ngrams]
    unigrams = list(ngrams)  # keep the unigram base to build higher orders from
    ncursor += 1
    while ncursor <= n:
        ngrams += ngramify(unigrams, ncursor)  # add the n-grams of order ncursor
        ncursor += 1
    return ngrams
    def encode_to_iob(self, sentence, entities):
        """
        Extract IOB labels for a given sentence

        Args:
            - sentence: list of tokens
            - entities: dict of slot => entity items
        
        Returns:
            list of IOB labels
        """
        # base data structures: list of labels (target)
        iob_labels = ['_O' for i in range(len(sentence))]

        # first step: get base labels from slot entities
        for slot in entities:
            for token in tokenize(entities[slot]):

                # TODO: if not in tokens?
                if token in sentence:
                    i = sentence.index(token)
                    iob_labels[i] = slot

        # redefine each label as either a beginning or inside tag

        # convert into list of tuples to make it easier to distinguish between single and sequence entities
        grouped_labels = []
        group = []
        prev_label = None
        for label in iob_labels:
            if label == prev_label:
                group.append(label)
            else:
                if prev_label:
                    grouped_labels.append(group)
                group = [label]
                prev_label = label
        if group:
            grouped_labels.append(group)

        # extract groups into one sequence
        labels = []
        for g in grouped_labels:
            if g[0] == '_O':
                labels += g
            elif len(g) == 1:
                labels.append(g[0] + '_I')
            else:
                labels.append(g[0] + '_B')
                labels += [x + '_I' for x in g[1:]]

        return labels
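Tracing the two passes on a toy input (the whitespace tokens and the single 'city' slot are invented for illustration):

sentence = ['book', 'a', 'flight', 'to', 'new', 'york']
entities = {'city': 'new york'}
# first pass marks positions 4 and 5 with the slot name:
#   ['_O', '_O', '_O', '_O', 'city', 'city']
# grouping plus the B/I rewrite then yields:
#   ['_O', '_O', '_O', '_O', 'city_B', 'city_I']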
Example #31
def bigram_splitter(text: str):
    """
    Returns list of bigrams in given text.
    """
    tokens = tokenize(text)
    if len(tokens) < 2:
        bigrams = []
    else:
        bigrams = [
            tokens[i] + " " + tokens[i + 1] for i in range(len(tokens) - 1)
        ]

    return bigrams
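Assuming a whitespace-style tokenize, for example:

print(bigram_splitter("the quick brown fox"))
# ['the quick', 'quick brown', 'brown fox']
print(bigram_splitter("fox"))  # [] -- fewer than two tokens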
Example #32
def analyser(documents):
    porter = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    strip_punct = str.maketrans('', '', string.punctuation)

    modified_arr = [[porter.stem(i.lower())
                     for i in tokenize(d.translate(strip_punct))
                     if i.lower() not in stop_words] for d in documents]
    modified_doc = [' '.join(i) for i in modified_arr]

    tf_idf = TfidfVectorizer().fit_transform(modified_doc)

    half = len(documents) // 2
    for i in range(half):
        minimum = (1, None)
        for j in range(half, len(documents)):
            minimum = min((cosine(tf_idf[i].todense(), tf_idf[j].todense()),
                           j - half), minimum)
        print(minimum[1] + 1)
Example #33
def transfer_vec(txts, wv_model, padding=10, dim=300):
    """
    transfer
    :param txts: a list of txt
    :param wv_model:
    :return: list of vector
    """

    if type(txts) in (list, tuple):
        vec = []
        for sentence in txts:
            sen_vec = []
            for word in tokenize(sentence.lower()):
                try:
                    sen_vec.append(wv_model[word])
                except KeyError:
                    sen_vec.append(np.random.rand(dim))  # unknown word: random fallback vector
            while len(sen_vec) < padding:
                sen_vec.append(np.zeros(dim, dtype=float))  # pad up to the fixed length
            if len(sen_vec) > padding:
                sen_vec = sen_vec[:padding]
            vec.append(np.array(sen_vec, dtype='f'))
        return vec
    elif type(txts) == str:
        sen_vec = []
        for word in tokenize(txts.lower()):
            try:
                sen_vec.append(wv_model[word])
            except KeyError:
                sen_vec.append(np.random.rand(dim))  # unknown word: random fallback vector
        while len(sen_vec) < padding:
            sen_vec.append(np.zeros(dim, dtype=float))  # pad up to the fixed length
        if len(sen_vec) > padding:
            sen_vec = sen_vec[:padding]
        return [np.array(sen_vec, dtype='f')]
    else:
        raise TypeError('%s is not in support type' % str(type(txts)))
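A rough check with a toy vector table standing in for wv_model (a plain dict raises KeyError for unknown words, matching the lookup path above; tokenize is assumed to split on whitespace):

import numpy as np

toy_model = {'hello': np.ones(300), 'world': np.zeros(300)}
vecs = transfer_vec("hello unknown world", toy_model, padding=4, dim=300)
print(len(vecs), vecs[0].shape)  # 1 (4, 300): padded (or truncated) to 4 token vectors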
def computeSentiment(tweet_text):
    pos_count = 0
    neg_count = 0
    pos_terms = []
    neg_terms = []
    st = EnglishStemmer()

    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        #print st.stem(t.lower())
        if st.stem(t.lower()) in negative_terms:
            neg_terms.append(t.lower())
            neg_count += 1
        elif st.stem(t.lower()) in positive_terms:
            pos_terms.append(t.lower())
            pos_count += 1

    return pos_count, neg_count, set(pos_terms), set(neg_terms)
def computeSentiment(tweet_text):
    annotated = ''
    positive = 0
    negative = 0
    st = EnglishStemmer()

    tokenized_tweet = tokenize(tweet_text)
    for t in tokenized_tweet:
        #print st.stem(t.lower())
        wsp = ' '
        if len(annotated) == 0 or annotated[-1] in '@#':
            wsp = ''
        if st.stem(t.lower()) in negative_terms:
            annotated += wsp+'<span class="negative">'+t+'</span>' 
            negative += 1
        elif st.stem(t.lower()) in positive_terms:
            annotated += wsp+'<span class="positive">'+t+'</span>'
            positive += 1
        else:
            if len(t) == 1 and t not in '@#':
                annotated += t
            else: annotated += wsp + t

    return annotated, positive, negative
Example #36
def extract_words(msg):
    ttl_words = set(tokenize(msg.replace('=\\n', '').lower()))
    final_words = [word for word in ttl_words if word not in stopword and len(word) >= 3]
    # tokens were lowercased above, so filter the lowercase header token
    final_words = [word for word in final_words if word != "subject"]
    return final_words
with open('sword.set', 'rb') as f:
    sword_list = load(f)

with open('phrase.set','rb') as f:
    phrase_list = load(f)

problem = lil_matrix((5000, 17173))
n = 0

for i, tfile in enumerate(train_files):
    if i < 2500:
        fdir = pos_dir
    else:
        fdir = neg_dir
    with open(fdir+tfile) as f:
        text = f.read()
        tokens = tokenize(text)
        fphrases = phrases(tokens)
    for token in tokens:
        if token in sword_list:
            ind = sword_list.index(token)
            problem[i, ind] = 1
    for p in fphrases:
        if p in phrase_list:
            ind = phrase_list.index(p) + 3111
            problem[i, ind] = 1

with open('problem.matrix', 'wb') as f:
    dump(problem, f)
Example #38
 def tag(self, tense):
     """Does translation from tag generated by tagger into unified format
     
         Args:
             sentence: list of touple (word and its form) which are after verb
         Returns:
             list of touple (word and its form in unified format)
     """
     words = self.__utag.tag(tokenize(tense))
     
     for i, (word, form) in enumerate(words):
         word_info = {}
         
         if form[0] == 'V': word_info['klasa'] = 'czasownik'
         elif form[0] == 'S': word_info['klasa'] = 'rzeczownik'
         elif form[0] == 'A': word_info['klasa'] = 'przymiotnik'
         elif form[0] == 'N': word_info['klasa'] = 'liczebnik'
         elif form[0] == 'Z': word_info['klasa'] = 'zaimek'
         elif form[0] == 'D': word_info['klasa'] = 'przysłówek'
         elif form[0] == 'P': word_info['klasa'] = 'przyimek'
         elif form[0] == 'C': word_info['klasa'] = 'spójnik'
         elif form[0] == 'I': word_info['klasa'] = 'wykrzyknik'
         elif form[0] == 'T': word_info['klasa'] = 'partykuła'
         else: word_info['klasa'] = 'nieznany'
         
         if form[1] == 'S': word_info['liczba'] = 'pojedyńcza'
         elif form[1] == 'P': word_info['liczba'] = 'mnoga'
         
         if(len(form) >= 3):
             if form[2] == 'N': word_info['przypadek'] = 'mianownik'
             elif form[2] == 'G': word_info['przypadek'] = 'dopełniacz'
             elif form[2] == 'D': word_info['przypadek'] = 'celownik'
             elif form[2] == 'A': word_info['przypadek'] = 'biernik'
             elif form[2] == 'I': word_info['przypadek'] = 'narzędnik'
             elif form[2] == 'L': word_info['przypadek'] = 'miejscownik'
             elif form[2] == 'V': word_info['przypadek'] = 'wołacz'
         
         if(len(form) >= 4):
             if form[3] == 'M': word_info['rodzaj'] = 'm'
             elif form[3] == 'P': word_info['rodzaj'] = 'm'
             elif form[3] == 'A': word_info['rodzaj'] = 'm'
             elif form[3] == 'I': word_info['rodzaj'] = 'm'
             elif form[3] == 'F': word_info['rodzaj'] = 'ż'
             elif form[3] == 'N': word_info['rodzaj'] = 'n'
             elif form[3] == 'O': word_info['rodzaj'] = 'm'
             elif form[3] == 'R': word_info['rodzaj'] = 'ż'
             elif form[3] == 'T': word_info['rodzaj'] = 'ż'
         if(len(form) >= 6):
             if form[5] == '1': word_info['osoba'] = 'pierwsza'
             elif form[5] == '2': word_info['osoba'] = 'druga'
             elif form[5] == '3': word_info['osoba'] = 'trzecia'
             elif form[5] == 'I': word_info['osoba'] = 'bezokolicznik'
             elif form[5] == 'B': word_info['osoba'] = 'bezosobnik'
             elif form[5] == 'U': word_info['osoba'] = 'imiesłów'
             elif form[5] == 'W': word_info['osoba'] = 'imiesłów'
         if(len(form) >= 7):
             if form[6] == 'T': word_info['czas'] = 'teraźniejszy'
             elif form[6] == 'P': word_info['czas'] = 'przeszły'
             elif form[6] == 'F': word_info['czas'] = 'przyszły'
         if(len(form) >= 8):
             if form[7] == 'O': word_info['tryb'] = 'oznajmujący'
             elif form[7] == 'P': word_info['tryb'] = 'przypuszczający'
             elif form[7] == 'R': word_info['tryb'] = 'rozkazujący'
         if(len(form) >= 9):
             if form[8] == 'D': word_info['aspekt'] = 'dokonane'
             elif form[8] == 'N': word_info['aspekt'] = 'niedokonane'
         
         words[i] = (words[i][0], word_info)
     
     return words
Example #39
    return c

defined_words = set()
freq = {}
total_tokens = 0
total_defs = 0
with codecs.open(data_filepath, 'r', 'utf-8') as ifp:
    for line in ifp:
        total_defs = total_defs + 1
        line = line.strip()
        parts = line.split('\t')
        if parts[0] not in freq:
            freq[parts[0]] = 0
        freq[parts[0]] = freq[parts[0]] + 1
        defined_words.add(parts[0])
        for t in tokenize(parts[3]):
            if t not in freq:
                freq[t] = 0
            freq[t] = freq[t] + 1
            total_tokens = total_tokens + 1

print('#word being defined: ' + str(len(defined_words)))
print('#definition: ' + str(total_defs))
print('#tokens: ' + str(total_tokens))
print('vocab size: ' + str(len(freq)))
print('rare word frequency: ')
print(' - 1: ' + str(num_words_with_freq(freq, 1)))
print(' - 2: ' + str(num_words_with_freq(freq, 2)))
print(' - 3: ' + str(num_words_with_freq(freq, 3)))
print(' - 4: ' + str(num_words_with_freq(freq, 4)))
print(' - 5: ' + str(num_words_with_freq(freq, 5)))
Example #40
 def _tokenize(self, tense, isPl):
     if isPl:
         return self.tagger.tag(tense)
     else:
         return self.__utag.tag(tokenize(tense))
def tokenize_sentence(sentence):
    ######################################
    #   Tokenizes and rejoins sentence   #
    ######################################

    return ' '.join(list(tokenize(sentence)))