Example #1
import metapy

def getBagOfWords(sentence):
    metapy.log_to_stderr()

    doc = metapy.index.Document()
    # doc.content("I said that I can't believe that it only costs $19.95!")
    doc.content(sentence)

    tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
    tok.set_content(doc.content())  # this could be any string

    # Here, we can see that the LengthFilter is consuming our original ICUTokenizer.
    # It modifies the token stream by only emitting tokens
    # that are of a minimum length of 2 and a maximum length of 30.

    tok = metapy.analyzers.LengthFilter(tok, min=2, max=30)
    tok.set_content(doc.content())  # this could be any string

    # Stopword removal: reject any token that appears in the stopword list
    tok = metapy.analyzers.ListFilter(tok, "lemur-stopwords.txt",
                                      metapy.analyzers.ListFilter.Type.Reject)
    tok.set_content(doc.content())
    tokens = [token for token in tok]

    cleanSentence = ""
    for word in tokens:
        cleanSentence += word
        cleanSentence += " "
    return cleanSentence
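
A quick way to try the function (a sketch, not from the original source; it assumes lemur-stopwords.txt is in the working directory, and the exact tokens emitted depend on the ICU tokenizer):

if __name__ == '__main__':
    # Stopwords such as "that" are rejected, and one-character tokens
    # fall below the LengthFilter minimum of 2.
    print(getBagOfWords("I said that I can't believe that it only costs $19.95!"))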
Example #2
    # Excerpt from a unittest.TestCase: assumes self.cfgs, self.get_results,
    # and self.submission_url are defined elsewhere in the class, and that
    # os, pytoml, requests, and a Timeout context manager are imported.
    def test_upload_submission(self):
        """
        This is the unit test that actually submits the results to the
        leaderboard. If there is an error (on either end of the submission),
        the unit test fails, and the failure string is also reproduced
        on the leaderboard.
        """
        metapy.log_to_stderr()
        req = {
            'token': os.environ.get('GITLAB_API_TOKEN'),
            'alias': os.environ.get('COMPETITION_ALIAS') or 'Anonymous',
            'results': []
        }

        for cfg_file in self.cfgs:
            res = {'error': None}
            with open(cfg_file, 'r') as fin:
                cfg_d = pytoml.load(fin)
            res['dataset'] = cfg_d['dataset']
            print("\nRunning on {}...".format(res['dataset']))
            timeout_len = cfg_d['timeout']

            try:
                with Timeout(timeout_len):
                    res['results'] = self.get_results(cfg_file)
            except Timeout.Timeout:
                error_msg = "Timeout error: {}s".format(timeout_len)
                res['error'] = error_msg
                res['results'] = []

            req['results'].append(res)

        response = requests.post(self.submission_url, json=req)
        jdata = response.json()
        print(jdata)
        self.assertTrue(jdata['submission_success'])
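
The Timeout context manager used in the test is not part of metapy or the standard library. A minimal signal-based sketch that matches the usage above (an assumption about the original helper; SIGALRM only works on the main thread of Unix platforms):

import signal

class Timeout:
    """Raise Timeout.Timeout if the managed block runs longer than `seconds`."""

    class Timeout(Exception):
        pass

    def __init__(self, seconds):
        self.seconds = seconds

    def _handle_alarm(self, signum, frame):
        raise Timeout.Timeout()

    def __enter__(self):
        # Schedule SIGALRM to fire after `seconds` seconds.
        signal.signal(signal.SIGALRM, self._handle_alarm)
        signal.alarm(int(self.seconds))
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        signal.alarm(0)  # cancel the alarm if the block finished in time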
Example #3
import metapy

metapy.log_to_stderr()

class TextProcessor:

    keywords = {'About', 'WorkExperience', 'Education', 'Certificates', 'Awards', 'Groups'}

    def __init__(self, top_n=5):
        self.document = metapy.index.Document()
        self.tokenizer = metapy.analyzers.ICUTokenizer(suppress_tags=True)
        self.tokenizer = metapy.analyzers.LengthFilter(self.tokenizer, min=2, max=30)
        self.tokenizer = metapy.analyzers.ListFilter(
            self.tokenizer,
            "data/lemur-stopwords.txt",
            metapy.analyzers.ListFilter.Type.Reject)
        self.top_N = top_n
        # Keep the maps per-instance: as class attributes, a single dict
        # would be shared across every TextProcessor instance.
        self.frequency_map = dict()
        self.sentence_token_map = dict()

    def split_sentences(self, text):
        sentences = []
        if text:
            for line in text.split('\n'):
                if '. ' in line and 'ltd. ' not in line.lower():
                    pieces = line.split('. ')
                    for piece in pieces:
                        sentences.append(piece)
                else: