Example No. 1
    def validateResponse(self, response, queryTerms):
        from collections import Counter

        from bs4 import BeautifulSoup
        from prettytable import PrettyTable

        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()
            # Tokenize the page text and index it in the term-document matrix
            terms = prep.ngram_tokenizer(text=soup.get_text())
            mx.add_doc(doc_id=response.url,
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)
            # for doc in mx.docs:
            #     distance = metric.euclid_vectors(doc['terms'], q_vector)
            #     print(distance)
            # Count term frequencies and print the ones matching the query
            cnt = Counter()
            for word in terms:
                cnt[word] += 1
            tableTerms = PrettyTable(["Term", "Frequency"])
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.lower() in queryTerms.lower().split():
                    tableTerms.add_row([word, cnt[word]])
            print(tableTerms)
        else:
            print("[-] Response for %s is %s" % (response.url,
                                                 response.status_code))
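
For reference, a minimal, self-contained sketch of the term-frequency table this method builds; the terms and query below are made up for illustration:

from collections import Counter
from prettytable import PrettyTable

terms = ['data', 'mining', 'data', 'irlib']  # hypothetical tokenizer output
queryTerms = 'data irlib'                    # hypothetical query string

cnt = Counter(terms)
tableTerms = PrettyTable(["Term", "Frequency"])
for word in sorted(cnt, key=cnt.get, reverse=True):
    if word.lower() in queryTerms.lower().split():
        tableTerms.add_row([word, cnt[word]])
print(tableTerms)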
Example No. 2
    def __validateResponse(self, response, queryTerms):
        from collections import Counter

        from bs4 import BeautifulSoup
        from texttable import Texttable

        from irlib.preprocessor import Preprocessor
        from irlib.matrix import Matrix
        from irlib.metrics import Metrics

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            prep = Preprocessor()
            mx = Matrix()
            metric = Metrics()
            # Tokenize the page text and index it in the term-document matrix
            terms = prep.ngram_tokenizer(text=soup.get_text())
            mx.add_doc(doc_id=response.url,
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)
            cnt = Counter()

            for word in terms:
                cnt[word] += 1
            table = Texttable()
            table.set_cols_align(["l", "l"])
            table.set_cols_valign(["m", "m"])
            table.set_cols_width([40, 55])

            rows = [["Term", "Frequency"]]
            for word in sorted(cnt, key=cnt.get, reverse=True):
                if word.lower() in queryTerms.lower().split():
                    rows.append([word, cnt[word]])
            table.add_rows(rows)
            print(table.draw() + "\n")

        else:
            print("[-] Response for %s is %s" % (response.url,
                                                 response.status_code))
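
The only difference from Example No. 1 is the rendering library; a minimal Texttable sketch with made-up rows:

from texttable import Texttable

table = Texttable()
table.set_cols_align(["l", "l"])
table.set_cols_valign(["m", "m"])
table.set_cols_width([40, 55])
table.add_rows([["Term", "Frequency"],  # first row is the header
                ["data", 3],            # sample rows for illustration
                ["irlib", 1]])
print(table.draw() + "\n")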
Example No. 3
    def test_stemmer_lower(self):
        p = Preprocessor(lower=True, stem=True)
        stemmed = p.stemmer('Running')
        if my_nltk:
            self.assertEqual(stemmed, 'run')
        else:
            self.fail('NLTK is not installed')
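
Outside the test harness, the same stemmer call looks like this (expected output taken from the assertion above; NLTK must be installed):

from irlib.preprocessor import Preprocessor

prep = Preprocessor(lower=True, stem=True)
print(prep.stemmer('Running'))  # 'run', per the test above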
Example No. 4
def main():
    # Load configuration from file
    config = Configuration(config_file='/home/huma/Downloads/irlib-0.1.1/irlib/classify.conf')
    try:
        config.load_configuration()
        config_data = config.get_configuration()
    except Exception:
        print("Error loading configuration file.")
        print("Classifier aborting.")
        raise

    # config.display_configuration()
    print(config)

    # sys.exit()

    myfolds = config.get_folds()
    correctness = 0

    # Preprocessor: tokenizer, stemmer, etc.
    prep_lower = config_data['lower']
    prep_stem = config_data['stem']
    prep_pos = config_data['pos']
    prep_ngram = config_data['ngram']
    prep = Preprocessor(pattern=r'\W+', lower=prep_lower, stem=prep_stem, pos=prep_pos, ngram=prep_ngram)

    for myfold in myfolds:
        ev = Evaluation(config=config, fold=myfold)
        if config_data['classifier'] == 'rocchio':
            ml = Rocchio(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        elif config_data['classifier'] == 'knn':
            ml = KNN(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        else:
            ml = NaiveBayes(verbose=VERBOSE, fold=myfold, config=config, ev=ev)
        training(config, myfold, ml, prep)
        ml.do_padding()
        ml.calculate_training_data()
        # r.display_idx()
        ml.diagnose()
        testing(config, myfold, ml, ev, prep)

        k = config_data['k']
        results = ev.calculate(review_spam=True, k=k)
        print('Accuracy for fold %d: %s' % (myfold, results))

        correctness += results

    print("Average accuracy for all folds:", correctness / len(myfolds))
Example No. 5
import os

from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix


def readfiles(fold_path='all-folds/fold1/'):

    prep = Preprocessor()
    mx = Matrix()

    files = os.listdir(fold_path)
    for filename in files:
        # Read each file, tokenize it, and index it in the matrix
        with open('%s/%s' % (fold_path, filename), 'r') as fd:
            file_data = fd.read()
        terms = prep.ngram_tokenizer(text=file_data)
        mx.add_doc(doc_id=filename,
                   doc_terms=terms,
                   frequency=True,
                   do_padding=True)

    print('Number of read documents:', len(mx.docs))
    print('Number of read terms:', len(mx.terms))
    # print(mx.terms[0:5], mx.terms[-5:-1])
    print(mx.terms)
    print(mx.docs)
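
Building on the matrix above, a hedged sketch of scoring each document against a query with Metrics.euclid_vectors, mirroring the commented-out distance loop in Example No. 1. The hand-built query vector assumes each doc vector lines up with the mx.terms vocabulary, which is an assumption here, not documented irlib behaviour; prep and mx are the objects from readfiles(), assumed visible at this scope:

from irlib.metrics import Metrics

metric = Metrics()
q_terms = prep.ngram_tokenizer(text='your query here')  # hypothetical query
# Assumed alignment: each doc['terms'] vector follows the mx.terms order
q_vector = [1 if term in q_terms else 0 for term in mx.terms]
for doc in mx.docs:
    print(metric.euclid_vectors(doc['terms'], q_vector))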
Example No. 6
# Load the three modules:
from irlib.preprocessor import Preprocessor
from irlib.matrix import Matrix
from irlib.metrics import Metrics

# Create instances for their classes:
prep = Preprocessor()
mx = Matrix()
metric = Metrics()
q_vector = []


def generateMatrix():
    count = 1
    # Each line of the source file is treated as one document
    with open('./content_transfer_data/roto-sent-data.train-ONLYTAGS.src', 'r') as fd:
        for line in fd:
            terms = [x.strip() for x in line.split(' ')]
            mx.add_doc(doc_id=str(count),
                       doc_terms=terms,
                       frequency=True,
                       do_padding=True)

            count += 1
            if count % 1000 == 0:
                print(count)  # progress report every 1000 lines

    # Persist the term-document matrix as a tab-delimited file
    mx.dump('IRmatrix.train.src.csv', delimiter='\t', header=True)
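
A minimal way to run it; IRmatrix.train.src.csv then holds the tab-delimited matrix written by mx.dump:

if __name__ == '__main__':
    generateMatrix()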

Example No. 7
    def __init__(self):
        self._mx = Matrix()
        self._prep = Preprocessor(pattern=r'\W+', lower=True, stem=True)
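
A hedged sketch of what that preprocessor configuration does to raw text; stem is switched off here so the expected output can be inferred from the tokenizer tests below rather than guessed:

from irlib.preprocessor import Preprocessor

prep = Preprocessor(pattern=r'\W+', lower=True, stem=False)
print(prep.tokenizer('Hello, World!'))  # roughly ['hello', 'world']: \W+ splits, lower=True lowercases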
Example No. 8
    def test_term2ch(self):
        p = Preprocessor()
        charlist = p.term2ch('help')
        self.assertEqual(charlist, ['h', 'e', 'l', 'p'])
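
term2ch yields character-level tokens; a short illustration of turning them into character bigrams (the bigram step is plain Python, not an irlib call):

from irlib.preprocessor import Preprocessor

p = Preprocessor()
chars = p.term2ch('data')                            # ['d', 'a', 't', 'a']
bigrams = [a + b for a, b in zip(chars, chars[1:])]  # ['da', 'at', 'ta']
print(bigrams)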
Example No. 9
    def test_3gram_tokenizer(self):
        p = Preprocessor(lower=False, stem=False, ngram=3)
        returned_tokens = p.ngram_tokenizer('how do you do?')
        expected_tokens = ['how do you', 'do you do']
        self.assertEqual(returned_tokens, expected_tokens)
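
By the same sliding-window logic, ngram=2 should yield word bigrams; a hedged sketch whose expected output is inferred from the trigram test above, not verified:

p = Preprocessor(lower=False, stem=False, ngram=2)
print(p.ngram_tokenizer('how do you do?'))  # presumably ['how do', 'do you', 'you do']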
Example No. 10
    def test_tokenizer_lower(self):
        p = Preprocessor(lower=True, stem=False)
        tokens = p.tokenizer('This is IRLib')
        self.assertEqual(tokens, ['this', 'is', 'irlib'])
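
For contrast, with lower=False the tokens should keep their original case; a hedged sketch, not taken from the test suite:

p = Preprocessor(lower=False, stem=False)
print(p.tokenizer('This is IRLib'))  # presumably ['This', 'is', 'IRLib']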