Example No. 1
def run_advanced_search(query):
    results = []
    query_lemmas = lemmatize(query)
    for article in ARTICLES:
        article_lemmas = lemmatize(article['title'] + '\n' + article['body'])
        for lemma in article_lemmas:
            if lemma in query_lemmas:
                results.append(article)
                break
    return results
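The function above assumes a module-level ARTICLES list and a lemmatize() helper that the excerpt does not show. A minimal, purely illustrative sketch of how it could be wired up (this lemmatize stand-in and the sample articles are assumptions, not part of the original project):

# Illustrative stand-in only: the real project supplies its own lemmatize()
# and ARTICLES; here lemmatization is faked with lowercased whitespace tokens.
def lemmatize(text):
    return set(text.lower().split())

ARTICLES = [
    {'title': 'Cats', 'body': 'Cats are small domesticated animals.'},
    {'title': 'Dogs', 'body': 'Dogs are loyal companions.'},
]

print(run_advanced_search('domesticated animal'))
# -> returns only the 'Cats' article, matched via the shared token 'domesticated'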
Example No. 2
import numpy as np

def add_to_matrix(user_id, text):
    global a
    global w
    global user2index
    global word2index
    tokens = lemmatize(text)
    # Count matrix: rows are authors (users), columns are words
    if user_id not in user2index:
        user2index[user_id] = len(user2index)
        a = np.insert(a, a.shape[0], 0, axis=0)
        print("Added a row: %s" % repr(a.shape))

    # Update the global word-frequency dict and the count matrix
    for token in tokens:
        if token in dict_of_words:
            dict_of_words[token] += 1
        else:
            dict_of_words[token] = 1
        if token in word2index:
            a[user2index[user_id]][word2index[token]] += 1
        else:
            # Unseen word: grow the matrix by one column and register its index
            a = np.insert(a, a.shape[1], 0, axis=1)
            word2index[token] = w
            a[user2index[user_id]][w] += 1
            w += 1
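add_to_matrix grows a user-by-word count matrix held in module-level state that the excerpt omits. A sketch of what that state presumably looks like (the names come from the snippet itself; the empty initial shapes are an assumption):

import numpy as np

# Assumed module-level state for add_to_matrix
a = np.zeros((0, 0), dtype=int)   # count matrix: rows = users, columns = words
w = 0                             # next free column index
user2index = {}                   # user_id -> row index in a
word2index = {}                   # word -> column index in a
dict_of_words = {}                # global word-frequency counter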
Example No. 3
def main():
    data = load_file(inputfile)

    for (k, v) in data.items():
        lemmalist = lemmatizer.lemmatize(v)
        data[k] = lemmalist

    print('--LEMMATIZATION FINISHED--', file=sys.stderr)
    save_file(data, outputfile)
Example No. 4
def tag_lemmatize_sentence(sentence):
    # Write tokens to the tagger's input file, one per line
    with open(tmp_input_path, 'w') as f_input:
        for token in tokenize(sentence):
            f_input.write(token + '\t_\n')
    # Tag tokens
    tagger.tag(b_tmp_input_path, b_tmp_output_path)
    # Iterate through tagged tokens and lemmatize each one
    with open(tmp_output_path) as f_output:
        for line in f_output:
            line = line.rstrip('\n')
            if line == '':
                continue
            token, tag = line.split('\t')
            lemma = lemmatize(token, tag)
            yield (token, tag, lemma)
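Because tag_lemmatize_sentence is a generator, nothing is written or tagged until it is iterated. A hedged usage sketch (the sentence and the printing are illustrative; tmp_input_path, tagger, tokenize and lemmatize come from the original module):

for token, tag, lemma in tag_lemmatize_sentence("Dogs were barking loudly."):
    print(token, tag, lemma, sep='\t')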
Example No. 5
def main():
    data = load_file(inputfile)

    # Lemmatize the text of every answer in the nested
    # article -> paragraphs -> qas -> answers structure
    for article in data['data']:
        for paragraph in article['paragraphs']:
            for qa in paragraph['qas']:
                for answer in qa['answers']:
                    answer['text'] = lemmatizer.lemmatize(answer['text'])

    print('--LEMMATIZATION FINISHED--')
    save_file(data, outputfile)
Example No. 6
def selection(freq, cards):
    used = set()
    choice = []

    # First pass through the cards
    for card in cards:
        ok = False
        for word in getWords(card.original):
            # If the card has a form that was not seen before, take it
            if getFreqId(lemmatize(word)) == freq and word not in used:
                used.add(word)
                ok = True
        if ok:
            choice.append(card)

    # While there are too few cards, try to add some more
    while len(choice) < min(3, len(cards)):
        for card in cards:
            if card not in choice:
                choice.append(card)
                break

    return choice
Example No. 7
while webpage != "EXIT":
    webpage = input("Enter a webpage: ")

    try:
        if webpage.lower() == "save":
            filename = input("Enter file name:")
            ml_trainer.save(filename)
            continue
        # elif webpage.lower() == "append":
        #     ml_trainer.add_to_model(lemmatext, labels)
        #     lemmatext.clear()
        #     labels.clear()
        #     continue

        html = urllib.request.urlopen(webpage)
        processed_text = lemmatizer.lemmatize(lemmatizer.text_from_html(html))
        # input("Press Enter to continue")
        arr = []
        arr.append(processed_text)
        vector = ml_trainer.vectorize(arr)
        # conf = 5
        # prediction = ml_trainer.predict(vector)[0]
        prediction = ml_trainer.predict(vector)[0]
        conf = ml_trainer.get_probability(vector)[0][prediction]

        # ml_trainer.add_to_model(vector, [prediction])

        # ml_trainer.add_to_model(vector, ml_trainer.predict(vector))
        # conf = 0
        if prediction != 1:
            print("The input URL was not a breach report, " + str(conf * 100) +
Example No. 8
import lemmatizer

# Sample Ukrainian text (about confiscation rules applying only to officials)
text = 'Конфіскація має стосуватися винятково чиновників (як місцевого, так і державного рівнів). Звичайний найнятий працівник, бізнесмен, пенсіонер або безробітний може спати спокійно. Але і для чиновників також є свої обмеження. Щоб потрапити під статтю, потрібно мати рахунок у банку, автомобіль, нерухомість, коштовності на суму, яка значно перевищує офіційний дохід.'

print(lemmatizer.lemmatize(text))
Example No. 9
print >> sys.stderr, "Writing to %s" % `outfilenames`

for f in filenames: assert os.path.exists(f)
for f in outfilenames:
    if os.path.exists(f):
        print >> sys.stderr, "Warning, going to overwrite %s" % f

#print "Sleeping for 10 seconds..."
#import time
#time.sleep(10)

inf = [open(f) for f in filenames]
outf = [open(f, "wt") for f in outfilenames]

tot = 0
cnt = 0
for lines in izip(*inf):
    tot += 1
    keep = False
    for w in string.split(lines[0]):
        if lemmatize("en", w) in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
            keep = True
            break
    if keep:
        cnt += 1
        for l, f in izip(lines, outf):
            f.write(l)
    if tot % 10000 == 0:
        print >> sys.stderr, "%s lines kept" % percent(cnt, tot)
        print >> sys.stderr, stats()
Example No. 10
def makeBold(self):
    self.bold = self.original
    for word in getRawWords(self.original):
        if getFreqId(lemmatize(word)) == self.freq:
            self.bold = boldWord(self.bold, word)
Example No. 11
allF = "all-sentences.txt"
tenF = "only-10.txt"
thousandF = "only-1000.txt"
with open(allF) as f:
    # Read the original sentences
    cards = []
    cnt = Counter()
    byNGram = {}
    sentences = []  # just for debugging purposes
    removed = 0
    for (num, line) in enumerate(f):
        audio, original, translation = line.split("\t")[:3]
        sentences.append(original)

        # Keep track of the occurrences of each lexeme
        cnt.update([lemmatize(word) for word in getWords(original)])

        # Get rid of overly similar sentences
        lemmas = [lemmatizeZealous(word) for word in getWords(original)]
        ok = True
        for i in range(len(lemmas) - ngram + 1):
            gram = tuple(sorted(lemmas[i:i + ngram]))
            # The sentence is deleted if it has an n-gram of lexemes which is
            # equal (up to permutation) to another sentence
            if gram in byNGram and byNGram[gram] != num:
                ok = False
                #print("Too similar:")
                #print(sentences[byNGram[gram]])
                #print(original)
                break
            else:
Example No. 12
for f in filenames:
    assert os.path.exists(f)
for f in outfilenames:
    if os.path.exists(f):
        print >> sys.stderr, "Warning, going to overwrite %s" % f

#print "Sleeping for 10 seconds..."
#import time
#time.sleep(10)

inf = [open(f) for f in filenames]
outf = [open(f, "wt") for f in outfilenames]

tot = 0
cnt = 0
for lines in izip(*inf):
    tot += 1
    keep = False
    for w in string.split(lines[0]):
        if lemmatize("en", w) in HYPERPARAMETERS["W2W FOCUS LEMMAS"]:
            keep = True
            break
    if keep:
        cnt += 1
        for l, f in izip(lines, outf):
            f.write(l)
    if tot % 10000 == 0:
        print >> sys.stderr, "%s lines kept" % percent(cnt, tot)
        print >> sys.stderr, stats()