Example #1
class TestDates(unittest.TestCase):

    def setUp(self):
        self.parser = CommonRegex()

    def test_numeric(self):
        formats = ["1-19-14", "1.19.14", "1.19.14", "01.19.14"]
        for f in formats:
            self.assertEqual(self.parser.dates(f), [f])

    def test_verbose(self):
        formats = ["January 19th, 2014", "Jan. 19th, 2014", "Jan 19 2014", "19 Jan 2014"]
        for f in formats:
            self.assertEqual(self.parser.dates(f), [f])
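Both call styles seen across these examples are supported by the commonregex library: constructed empty, each matcher is a method that takes text; constructed with text, the same names are pre-computed result lists. A minimal sketch:

from commonregex import CommonRegex

# Method style: construct once, pass text per call.
parser = CommonRegex()
print(parser.dates("Due 1-19-14"))   # ['1-19-14']

# Attribute style: pass text to the constructor and read the
# pre-computed result lists.
parsed = CommonRegex("Due 1-19-14")
print(parsed.dates)                  # ['1-19-14']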
Example #2
class PiiAnalyzer(object):
    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.standford_ner = StanfordNERTagger(
            'classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        with open(self.filepath, 'r') as filedata:
            reader = csv.reader(filedata)

            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    phone_numbers.extend(
                        self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))

        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            if tag == 'LOCATION':
                locations.append(title)
            if tag == 'ORGANIZATION':
                organizations.append(title)

        return {
            'people': people,
            'locations': locations,
            'organizations': organizations,
            'emails': emails,
            'phone_numbers': phone_numbers,
            'street_addresses': street_addresses,
            'credit_cards': credit_cards,
            'ips': ips
        }
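A hypothetical run of the analyzer above; the CSV filename is a placeholder, and NLTK plus a local copy of the Stanford NER classifier are assumed to be installed:

analyzer = PiiAnalyzer('contacts.csv')   # placeholder input file
results = analyzer.analysis()
print(results['emails'])
print(results['phone_numbers'])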
Example #3
def addresses(data):
    st_address = []
    loc = ''
    loc_list = []
    parsed_text = CommonRegex(data)
    st_address = parsed_text.street_addresses
    words = nltk.word_tokenize(data)
    taggsets = nltk.pos_tag(words)
    namedEnt = nltk.ne_chunk(taggsets, binary=False)

    for subtree in namedEnt.subtrees():
        if subtree.label() == 'GPE':
            l = []
            # print((subtree.leaves()))
            for leaf in subtree.leaves():
                l.append(leaf[0])
            loc = ' '.join(l)
            if loc not in loc_list:
                loc_list.append(loc)

    loc_list.extend(st_address)

    #    for add in loc_list:
    #        data = data.replace(add, block)

    return data, loc_list
Example #4
def check_for_address_format(addresses):
    final = []
    for item in addresses:
        parsed = CommonRegex(str(item))
        if parsed.street_addresses:
            final.append(parsed)
    return len(final)
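Note that this returns a count, not the addresses themselves (it appends the whole parsed object, not the match). A hypothetical call:

check_for_address_format(["717 N 4th St", "no address here"])   # -> 1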
Example #5
def dates(data):
    parsed_text = CommonRegex(data)
    dates_list = parsed_text.dates

    #    for date in dates_list:
    #        data = data.replace(date, block)
    return data, dates_list
Example #6
class TestTimes(unittest.TestCase):

    def setUp(self):
        self.parser = CommonRegex()

    def test_times(self):
        formats = ["09:45", "9:45", "23:45", "9:00am", "9am", "9:00 A.M.", "9:00 pm"]
        for f in formats:
            self.assertEqual(self.parser.times(f), [f])
Example #7
def getFields(filename):
    text = open(filename).read()
    text = text.upper()
    parsed_text = CommonRegex(text)
    print(parsed_text.street_addresses)
Example #8
def index():
    if request.method == 'GET':
        return 'OK'
    elif request.method == 'POST':
        # Store the IP address of the requester
        request_ip = ipaddress.ip_address(u'{0}'.format(request.remote_addr))

        # If GHE_ADDRESS is specified, use it as the hook_blocks.
        if os.environ.get('GHE_ADDRESS', None):
            hook_blocks = [os.environ.get('GHE_ADDRESS')]
        # Otherwise get the hook address blocks from the API.
        else:
            hook_blocks = requests.get(
                'https://api.github.com/meta').json()['hooks']

        if request.headers.get('X-GitHub-Event') == "ping":
            return json.dumps({'msg': 'Hi!'})

        if request.headers.get('X-GitHub-Event') == 'pull_request':
            merge_state = request.json['pull_request']['state']
            merge_body = request.json['pull_request']['body']
            if (merge_state == 'closed'):
                print('Merge state closed')
                print('Merge Body: ' + merge_body)
                parsed_bounty_issue = re.findall(r"#(\w+)", merge_body)[0]

                repository_path_encode = repository_path.encode('utf-8')
                bounty_issue_encode = str(parsed_bounty_issue).encode('utf-8')
                passphrase = hashlib.sha256(repository_path_encode +
                                            bounty_issue_encode).hexdigest()
                addresses = CommonRegex(merge_body).btc_addresses[0]
                bounty_address = github.get_address_from_issue(
                    parsed_bounty_issue)
                amount = utils.get_address_balance(bounty_address)
                with open(DEFAULT_WALLET_PATH, 'r') as f:
                    json_data = json.load(f)
                issue_name = json_data[parsed_bounty_issue]
                multisig_wallet.send_bitcoin(str(issue_name), str(addresses),
                                             int(amount * 1e8),
                                             str(passphrase))
                return json.dumps({'message': 'Pull request received'})
            return json.dumps({'message': 'Pull request payout failed'})

        if request.headers.get('X-GitHub-Event') == 'issue_comment':
            comment_data = {
                'url': request.json['comment']['issue_url'],
                'payout_address': request.json['issue']['labels'][0]['name'],
                'payout_amount': request.json['issue']['labels'][1]['name'],
                'body': request.json['comment']['body']
            }
            print(comment_data)
            return json.dumps({'message': 'Issue comment received'})
Example #9
class TestLinks(unittest.TestCase):

    def setUp(self):
        self.parser = CommonRegex()

    def test_links(self):
        formats = ["www.google.com", "http://www.google.com", "www.google.com/?query=dog"
                   "sub.example.com", "http://www.google.com/%&#/?q=dog"]
        for f in formats:
            self.assertEqual(self.parser.links(f), [f])
Example #10
def contains_personal_info(text):
    parsed_text = CommonRegex(text)
    return any([
        bool(parsed_text.links),
        bool(parsed_text.emails),
        bool(parsed_text.ips),
        bool(parsed_text.ipv6s),
        bool(parsed_text.credit_cards),
        bool(parsed_text.btc_addresses),
    ])
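For example (the results follow directly from the matchers above):

print(contains_personal_info("write to alice@example.com"))   # True
print(contains_personal_info("nothing sensitive here"))       # False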
Example #11
class TestPhones(unittest.TestCase):

    def setUp(self):
        self.parser = CommonRegex()

    def test_phones(self):
        formats = ["12345678900", "1234567890", "1 234 567 8900", "234-567-8900",
                   "1-234-567-8900", "1.234.567.8900", "5678900", "567-8900"]
        for f in formats:
            self.assertEqual(self.parser.phones(f), [f])
Example #12
def extract_email(values):
    emaillist = []
    perm_email = []
    for item in values:
        Email = CommonRegex(str(item))
        if Email.emails:
            emaillist.append(Email.emails)
    for i in emaillist:
        for j in i:
            perm_email.append(j)
    return perm_email
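The two trailing loops only flatten emaillist; an equivalent, more compact sketch (not the original code) could use itertools.chain:

from itertools import chain
from commonregex import CommonRegex

def extract_email_flat(values):
    # Collect the email list found in each item, then flatten once.
    found = (CommonRegex(str(item)).emails for item in values)
    return list(chain.from_iterable(found))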
Example #13
def extract_phonenumbers(values):
    phonelist = []
    perm_phone = []
    for item in values:
        phonenumbers = CommonRegex(str(item))
        if phonenumbers.phones:
            phonelist.append(phonenumbers.phones)
    for i in phonelist:
        for j in i:
            perm_phone.append(j)
    return perm_phone
Example #14
class PiiAnalyzer(object):
    def __init__(self, filepath):
        self.filepath = filepath
        self.parser = CommonRegex()
        self.standford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')

    def analysis(self):
        people = []
        organizations = []
        locations = []
        emails = []
        phone_numbers = []
        street_addresses = []
        credit_cards = []
        ips = []
        data = []

        with open(self.filepath, 'r') as filedata:
            reader = csv.reader(filedata)

            for row in reader:
                data.extend(row)
                for text in row:
                    emails.extend(self.parser.emails(text))
                    phone_numbers.extend(self.parser.phones("".join(text.split())))
                    street_addresses.extend(self.parser.street_addresses(text))
                    credit_cards.extend(self.parser.credit_cards(text))
                    ips.extend(self.parser.ips(text))

        for title, tag in self.standford_ner.tag(set(data)):
            if tag == 'PERSON':
                people.append(title)
            if tag == 'LOCATION':
                locations.append(title)
            if tag == 'ORGANIZATION':
                organizations.append(title)

        return {'people': people, 'locations': locations, 'organizations': organizations,
                'emails': emails, 'phone_numbers': phone_numbers, 'street_addresses': street_addresses,
                'credit_cards': credit_cards, 'ips': ips
                }
Example #15
def find_numbers(data):
    numbers = []
    data1 = CommonRegex(data)
    if data1.phones:
        for n in data1.phones:
            numbers.append(n)
    size = len(numbers)

    stats = (
        "The number of unique phone numbers replaced in the given file is %d \n"
        % size)

    return numbers, stats
Example #16
def find_address(data):
    addresses = []
    data1 = CommonRegex(data)
    if data1.street_addresses:
        for n in data1.street_addresses:
            addresses.append(n)
    size = len(addresses)

    stats = (
        "The number of unique addresses replaced in the given file is %d \n" %
        size)

    return addresses, stats
Example #17
    def detect(self, column: CatColumn, datum: str) -> Optional[PiiType]:
        """Scan the text and return an array of PiiTypes that are found"""
        regex_result = CommonRegex(datum)

        if regex_result.phones:  # pylint: disable=no-member
            return Phone()
        if regex_result.emails:  # pylint: disable=no-member
            return Email()
        if regex_result.credit_cards:  # pylint: disable=no-member
            return CreditCard()
        if regex_result.street_addresses:  # pylint: disable=no-member
            return Address()

        return None
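Note that detect returns only the first matching type in the order tested. A hypothetical call, where detector is an instance of the enclosing class, column is any CatColumn, and Phone is the assumed PiiType subclass:

pii = detector.detect(column, "234-567-8900, bob@example.com")
# -> Phone(), even though an email is also present: phones are checked first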
Example #18
def find_dates(data):
    data1 = CommonRegex(data)
    date5 = []
    if data1.dates:
        for n in data1.dates:
            date5.append(n)
    for n in date5:
        data = data.replace(n, "█" * len(n))
    size = len(date5)

    stats = ("The number of unique dates replaced in the given file is %d \n" %
             size)

    return date5, stats
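A hypothetical call; note that the redacted text is built locally but only the date list and summary string are returned:

dates_found, summary = find_dates("Invoice dated 1-19-14, paid 2-20-14.")
print(dates_found)   # ['1-19-14', '2-20-14']
print(summary)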
Example #19
    def scan(self, text):
        """Scan the text and return an array of PiiTypes that are found"""
        regex_result = CommonRegex(text)

        types = []
        if regex_result.phones:  # pylint: disable=no-member
            types.append(PiiTypes.PHONE)
        if regex_result.emails:  # pylint: disable=no-member
            types.append(PiiTypes.EMAIL)
        if regex_result.credit_cards:  # pylint: disable=no-member
            types.append(PiiTypes.CREDIT_CARD)
        if regex_result.street_addresses:  # pylint: disable=no-member
            types.append(PiiTypes.ADDRESS)

        return types
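A hypothetical call, assuming scanner is an instance of the enclosing class and PiiTypes is the enum it references:

found = scanner.scan("call 234-567-8900 or write bob@example.com")
# -> [PiiTypes.PHONE, PiiTypes.EMAIL]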
Example #20
    def get_address_from_issue(issue_number):

        github_url = "https://api.github.com/repos/" + repository['path'] + '/issues/' + issue_number
        headers = { "Authorization": "token " + GITHUB_TOKEN, "Content-Type": "application/json" }

        # Setup the request
        req = urllib.request.Request(github_url, headers=headers)

        # Make the request, capture the response
        res = urllib.request.urlopen(req).read()
        res = json.loads(res.decode())

        body = res['body']
        address = CommonRegex(body).btc_addresses[0]

        return address
Example #21
def find_info(text, attribute):
    if text is None:
        return []
    parsed_text = CommonRegex(text)
    if attribute == Information.link:
        return parsed_text.links
    if attribute == Information.email:
        return parsed_text.emails
    if attribute == Information.time:
        return parsed_text.times
    if attribute == Information.date:
        return parsed_text.dates
    if attribute == Information.phone:
        return parsed_text.phones
    if attribute == Information.price:
        return parsed_text.prices
    if attribute == Information.address:
        return parsed_text.street_addresses
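For example, with the Information enum this function assumes:

find_info("meet at 9:45, RSVP to bob@example.com", Information.time)
# -> ['9:45']
find_info("meet at 9:45, RSVP to bob@example.com", Information.email)
# -> ['bob@example.com']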
Example #22
def dates(data):
    text_parsed = CommonRegex(data)
    list_of_dates = text_parsed.dates
    # Alternatively, dates can be found with a hand-written regular expression, defined however we like:
    """ 
    init_file = []
    total_data = data
    for i in range(len(total_data)):
        red = total_data[i]
        red_dates = re.findall(r"([A-Z]\w\w\w+\s\d+,\s\d\d\d\d)", red)
        for match in red_dates:
            red = red.replace(match, '██')
        init_file.append(red)
    return init_file
    """
    return data, list_of_dates
Example #23
def is_stop_sentence(sentence):
    words = {
        'copyright', 'unsubscribe', 'instagram', 'twitter', 'facebook',
        'youtube', 'style='
    }

    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        if token.lower() in words:
            return True

    parsed_text = CommonRegex(sentence)
    if any([parsed_text.street_addresses, parsed_text.emails,
            parsed_text.phones, parsed_text.times, parsed_text.dates,
            parsed_text.links, parsed_text.zip_codes]):
        return True

    return False
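For instance:

print(is_stop_sentence("Unsubscribe at www.example.com"))   # True: stop word and link
print(is_stop_sentence("The weather was pleasant."))        # False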
Example #24
 def parsedContent(this):
     openObject = open(this.path, "r")
     this.contents = openObject.read()
     this.parsed_text = CommonRegex(this.contents)
Example #25
 def parsedContent(self):
     openObject = open(self.path, "r")
     self.contents = openObject.read()
     self.parsed_text = CommonRegex(self.contents)
Example #26
def extractEntries(zipCode, page):
    entries = driver.find_elements_by_css_selector("td")
    countNewEntries = 0
    countDupes = 0
    countElements = 0
    pageDupes = set()
    for entry in entries:
        children = entry.find_elements_by_css_selector("div b a")

        if len(children) != 1:
            continue

        countElements += 1

        nameElements = entry.find_elements_by_css_selector("div b a")

        name = None
        phone = None
        address = None
        email = None
        url = None

        if nameElements:
            possibleNames = [elem.text for elem in nameElements if elem.text]
            if possibleNames:
                name = possibleNames[0]

        lines = entry.text.splitlines()

        for line in lines:
            parsed_text = CommonRegex(line)

            valid_urls = []
            if hasattr(parsed_text.links, '__call__'):
                if parsed_text.links():
                    valid_urls = [link for link in parsed_text.links() if 'gmail' not in link and 'yahoo' not in link and 'hotmail' not in link]
            else:
                if parsed_text.links:
                    valid_urls = [link for link in parsed_text.links if 'gmail' not in link and 'yahoo' not in link and 'hotmail' not in link]
            if valid_urls:
                url = valid_urls[0]

            if hasattr(parsed_text.emails, '__call__'):
                if parsed_text.emails():
                    email = parsed_text.emails()[0]
            else:
                if parsed_text.emails:
                    email = parsed_text.emails[0]

            if hasattr(parsed_text.phones, '__call__'):
                if parsed_text.phones():
                    phone = parsed_text.phones()[0]
            else:
                if parsed_text.phones:
                    phone = parsed_text.phones[0]

            if hasattr(parsed_text.street_addresses, '__call__'):
                if parsed_text.street_addresses():
                    address = parsed_text.street_addresses()[0]
            else:
                if parsed_text.street_addresses:
                    address = parsed_text.street_addresses[0]

        dataText = entry.text.replace("\n", " --- ")

        if name or phone or email:
            data = {
                "name": name,
                "phone": phone,
                "address": address,
                "email": email,
                "zip": zipCode,
                "url": url,
                "data": dataText
            }

            dedupeKey = str(data['name']) + str(data['email'])

            if dedupeKey not in dedupeKeys:
                countNewEntries += 1
                extracted.append(data)
                dedupeKeys.add(dedupeKey)
            elif dedupeKey not in pageDupes:
                countDupes += 1

            pageDupes.add(dedupeKey)

    print(f"    {zipCode}@{page}: Added {countNewEntries} new entries. Had {countDupes} dupes. Examined {countElements} elements")

    return countNewEntries > 0
Example #27
def index():
    if request.method == 'GET':
        return 'OK'
    elif request.method == 'POST':
        # Store the IP address of the requester
        request_ip = ipaddress.ip_address(u'{0}'.format(request.remote_addr))

        # If GHE_ADDRESS is specified, use it as the hook_blocks.
        if os.environ.get('GHE_ADDRESS', None):
            hook_blocks = [os.environ.get('GHE_ADDRESS')]
        # Otherwise get the hook address blocks from the API.
        else:
            hook_blocks = requests.get(
                'https://api.github.com/meta').json()['hooks']

        if request.headers.get('X-GitHub-Event') == "ping":
            return json.dumps({'msg': 'Hi!'})

        if request.headers.get('X-GitHub-Event') == 'pull_request':
            if not request.json['pull_request']['user']['site_admin']:
                return json.dumps(
                    {'message': 'Pull request not submitted by site admin'})
            merge_state = request.json['pull_request']['state']
            merge_body = request.json['pull_request']['body']
            if (merge_state == 'closed'):
                print('Merge state closed')
                print('Merge Body: ' + merge_body)
                parsed_bounty_issue = re.findall(r"#(\w+)", merge_body)[0]
                addresses = CommonRegex(merge_body).btc_addresses[0]
                bounty_address = github.get_address_from_issue(
                    parsed_bounty_issue)
                amount = multisig_wallet.get_address_balance(bounty_address)
                try:
                    # use username to look up wallet Id
                    with open(DEFAULT_WALLET_PATH, 'r') as wallet:
                        data = json.loads(wallet.read())
                    for user in data:
                        try:
                            if (user['issue_number'] == int(
                                    parsed_bounty_issue)):
                                print('Wallet found')
                                wallet_name = user['wallet_name']
                                walletId = user[wallet_name]['walletId']
                        except:
                            print('Loading wallet..')

                except:
                    print('Wallet not found, creating new user...')

                # Set up sending of the bounty

                issue_title = wallet_name
                repository_path_encode = repository_path.encode('utf-8')
                issue_title_encode = issue_title.encode('utf-8')
                passphrase = hashlib.sha256(repository_path_encode +
                                            issue_title_encode).hexdigest()
                multisig_wallet.send_bitcoin_simple(walletId, str(addresses),
                                                    amount, passphrase)

                # Set up sending of the tweet

                usd_per_btc = requests.get(
                    'https://bitpay.com/api/rates/usd').json()['rate']
                bounty_in_btc = round((int(amount) / 10**8), 3)
                bounty_in_usd = round(bounty_in_btc * usd_per_btc, 2)
                url = 'https://github.com/21hackers/git-money/issues/' + parsed_bounty_issue
                twitter.send('Bounty Granted (' + str(amount) + ' bits ~ $' +
                             str(bounty_in_usd) + '): ' + issue_title + ' ' + url)

                return json.dumps({'message': 'Pull request received'})
            return json.dumps({'message': 'Pull request payout failed'})

        if request.headers.get('X-GitHub-Event') == 'issue_comment':
            comment_data = {
                'url': request.json['comment']['issue_url'],
                'payout_address': request.json['issue']['labels'][0]['name'],
                'payout_amount': request.json['issue']['labels'][1]['name'],
                'body': request.json['comment']['body']
            }
            print(comment_data)
            return json.dumps({'message': 'Issue comment received'})
Example #28
 def setUp(self):
     self.parser = CommonRegex()
Example #29
def echo():
    # Getting the data
    InputData = request.args.get('echoValue')  #unicode
    if InputData:
        EncodedInputData = InputData.encode("utf-8")

    else:
        return jsonify(Empty='Empty')
    SpacySmallData = SpacySmall(InputData)
    SpacyMediumData = SpacyMedium(InputData)
    places1 = GeoText(EncodedInputData)
    # Extract emails with the CommonRegex library
    email = []
    parsed_text = CommonRegex(EncodedInputData)
    email.append(parsed_text.emails)
    # URL Finding(URL Regex)
    URLMatcher = re.search(
        r'\(?\b(http://|www[.])[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]',
        EncodedInputData)
    url = []
    if URLMatcher:
        url.append(URLMatcher.group(0))
    places = GeoText(EncodedInputData)
    # Finding name using the Spacy library
    name = []
    for ent in SpacySmallData.ents:
        if ent.label_ == 'PERSON':
            name.append((ent.text))
    names = list(set(name) - set(places1.countries) - set(places1.cities))
    name1 = [x for x in names if not any(x1.isdigit() for x1 in x)]
    org = []
    # Finding the Organisation using the Spacy library
    for ent in SpacyMediumData.ents:
        if ent.label_ == 'ORG':
            org.append((ent.text))
    if not org:
        for ent in SpacySmallData.ents:
            if ent.label_ == 'ORG':
                org.append((ent.text))
    places2 = GeoText(EncodedInputData)

    language = ""
    # Detecting the language
    try:
        language = detect(InputData)
    except:
        pass

    # Distinguish between phone, fax, and mobile numbers
    TrainedSpacy = TrainedSpacyData(InputData)
    mob = []
    fax = []
    tel = []
    for ent in TrainedSpacy.ents:
        if ent.label_ == 'TEL':
            tel.append((ent.text))
        if ent.label_ == 'MOB':
            mob.append(ent.text)
        if ent.label_ == 'FAX':
            fax.append(ent.text)
#  Find the word in front of the phone number and classify it as phone, fax, or mobile based on that word, using the phonenumbers library.
    tel1 = []
    fax1 = []
    mob1 = []
    for match in phonenumbers.PhoneNumberMatcher(InputData, None):
        cnew = NaiveBayesClassifier(train)
        a = match.start
        start = match.start
        end = match.end
        # print a
        my = InputData
        # if (a > 1):
        try:
            word = my[:(a - 1)].split()[-1]
            if any(x.isalpha() for x in word):
                if cnew.classify(word) == 'tel':
                    tel1.append(my[start:end])
                elif cnew.classify(word) == 'fax':
                    fax1.append(my[start:end])
                elif cnew.classify(word) == 'mob':
                    mob1.append(my[start:end])
            else:
                word2 = my[:(a - 1)].split()[-2]
                if cnew.classify(word2) == 'tel':
                    tel1.append(my[start:end])
                elif cnew.classify(word2) == 'fax':
                    fax1.append(my[start:end])
                elif cnew.classify(word2) == 'mob':
                    mob1.append(my[start:end])
        except IndexError:
            tel1.append(my[start:end])
# Find the word in front of the phone number and classify it as phone, fax, or mobile based on that word, using the trained model
    phone = TrainedSpacyData(InputData)
    for ent in phone.ents:
        if (ent.label_ == 'TEL' or ent.label_ == 'FAX' or ent.label_ == 'MOB'):
            cnew = NaiveBayesClassifier(train)
            a = ent.start_char
            # print a
            my = InputData
            try:
                word = my[:(a - 1)].split()[-1]
                if any(x.isalpha() for x in word):
                    if cnew.classify(word) == 'tel':
                        tel1.append(ent.text)
                    elif cnew.classify(word) == 'fax':
                        fax1.append(ent.text)
                    elif cnew.classify(word) == 'mob':
                        mob1.append(ent.text)
                else:
                    word2 = my[:(a - 1)].split()[-2]
                    if cnew.classify(word2) == 'tel':
                        tel1.append(ent.text)
                    elif cnew.classify(word2) == 'fax':
                        fax1.append(ent.text)
                    elif cnew.classify(word2) == 'mob':
                        mob1.append(ent.text)
            except IndexError:
                tel1.append(ent.text)
    tel1 = [x for x in tel1 if sum(c.isdigit() for c in x) > 9]
    fax1 = [x for x in fax1 if sum(c.isdigit() for c in x) > 9]
    mob1 = [x for x in mob1 if sum(c.isdigit() for c in x) > 9]
    tel2 = list(set(tel1))
    fax2 = list(set(fax1))
    mob2 = list(set(mob1))
    # Remove alphabetic characters from telephone numbers
    tel2 = removeAplhabet(tel2)
    fax2 = removeAplhabet(fax2)
    mob2 = removeAplhabet(mob2)
    # Title detection
    # Finding the position entity using the library and model
    pos = []
    data = EncodedInputData.decode("utf-8")
    data = data.title()
    title = finder.findall(data)
    y = 0
    for x in title:
        pos.append(InputData[title[y][0]:title[y][1]])
        y += 1
    if not pos:
        for ent in TrainedSpacy.ents:
            if ent.label_ == 'POS':
                pos.append(ent.text)

    # Continuation of the organisation handling: remove duplicates from the position list
    for x in pos:
        for y in org:
            if x in y:
                org.remove(y)
    org1 = list(
        set(org) - set(places1.countries) - set(places1.cities) - set(name1))
    orgg = []
    for x in org1:
        orgg.append((x))

    # Address Detection
    AddressList = []
    # Using a PYAP library
    addresses = pyap.parse(EncodedInputData, country='US')
    for address in addresses:
        AddressList.append(str(address))

    # Identifying Address using Trained Spacy Model
    if not AddressList:
        for ent in phone.ents:
            if ent.label_ == 'add':
                AddressList.append(ent.text)
    # Identifying the address by finding the lines that contain a city
    if not AddressList:
        for line in EncodedInputData.splitlines():
            for city in places2.cities:
                if city in line:
                    AddressList.append(line)
            AddressList = [x for x in AddressList
                           if "Mobile" not in x]  # Limitation
            AddressList = [x for x in AddressList
                           if "MOBILE" not in x]  # Limitation

    # Identifying the address using a regular expression
    if not AddressList:
        if places2.cities:
            r2 = re.compile(
                r'([(\d|-|/|/s|(A-Z)?){1-7}]+[,|-|\s]+[A-zZ]+[Aa-zZ]+.*)')
            add = r2.findall(EncodedInputData)
            # print add
            for text in add:
                for text2 in places.cities:
                    if text2 in text:
                        AddressList.append(text)
    wotex = list(set(AddressList))

    # Passing Address into the Geocoder-GOOGLE Library to extract into components
    import geocoder
    add_1 = []
    city_1 = []
    country_1 = []
    code_1 = []
    zip_1 = []
    county_1 = []
    state_1 = []
    for x in wotex:
        try:
            g1 = geocoder.google(x)
            if g1.postal:
                zip_1.append(g1.postal)
            add1 = ""
            if g1.housenumber:
                add1 = g1.housenumber
            if g1.street:
                add1 = add1 + " " + g1.street
            add_1.append(add1)
            if g1.country:
                code_1.append(g1.country)
            # city_1.append(g1.city)
            if g1.city:
                city_1.append(g1.city)
            if g1.country_long:
                country_1.append(g1.country_long)
            if g1.state_long:
                state_1.append(g1.state_long)
            if g1.county:
                county_1.append(g1.county)
            if not g1.city:
                place_new = GeoText(x)
                y = str(place_new.cities)
                g2 = geocoder.google(y)
                if g2.country:
                    code_1.append(g2.country)
                if g2.country_long:
                    country_1.append(g2.country_long)
                if g2.city:
                    city_1.append(g2.city)
                if g2.postal:
                    zip_1.append(g2.postal)
                if g2.county:
                    county_1.append(g2.county)
        except Exception:
            pass


    # Passing the address into the Geocoder-OSM library to extract components
    add_OSM = []
    city_OSM = []
    zip_OSM = []
    code_OSM = []
    country_OSM = []
    county_OSM = []
    state_OSM = []
    for y in wotex:
        try:
            g21 = geocoder.osm(y)
            add2 = ""
            if g21.json and 'housenumber' in g21.json:
                add2 = g21.json['housenumber']
            if g21.json and 'street' in g21.json:
                add2 = add2 + ' ' + g21.json['street']
            add_OSM.append(add2)
            if g21.json and 'city' in g21.json:
                city_OSM.append(g21.json['city'])
            if g21.json and 'country' in g21.json:
                country_OSM.append(g21.json['country'])
            if g21.json and 'postal' in g21.json:
                zip_OSM.append(g21.json['postal'])
            if g21.json and g21.json['raw']['address']['country_code']:
                code_OSM.append(g21.json['raw']['address']['country_code'])
            if g21.json and 'county' in g21.json:
                county_OSM.append(g21.json['county'])
            if g21.json and 'state' in g21.json:
                state_OSM.append(g21.json['state'])
            if not city_OSM:
                placess = GeoText(y)
                x = str(placess.cities)
                print(x)
                g3 = geocoder.osm(x)
                print(g3.json)
                if g3.json and 'city' in g3.json:
                    city_OSM.append(g3.json['city'])
                if g3.json and 'country' in g3.json:
                    country_OSM.append(g3.json['country'])
                if g3.json and 'postal' in g3.json:
                    zip_OSM.append(g3.json['postal'])
                if g3.json and g3.json['raw']['address']['country_code']:
                    code_OSM.append(g3.json['raw']['address']['country_code'])
                if g3.json and 'county' in g3.json:
                    county_OSM.append(g3.json['county'])
                if g3.json and 'state' in g3.json:
                    state_OSM.append(g3.json['state'])
        except Exception:
            pass

    email_N = ','.join(map(unicode, parsed_text.emails))
    url_N = ','.join(map(unicode, url))
    name1_N = ','.join(map(unicode, name1))
    orgg_N = ','.join(map(unicode, orgg))
    try:
        wotex_N = ','.join(map(unicode, wotex))
    except Exception:
        wotex_N = ','.join(map(str, wotex))
    pos_N = ','.join(map(unicode, pos))
    tel_N = ','.join(map(unicode, tel))
    mob_N = ','.join(map(unicode, mob))
    fax_N = ','.join(map(unicode, fax))
    tel2_N = ','.join(map(unicode, tel2))
    fax2_N = ','.join(map(unicode, fax2))
    mob2_N = ','.join(map(unicode, mob2))
    add_1_N = ','.join(map(unicode, add_1))
    zip_1_N = ','.join(map(unicode, zip_1))
    code_1_N = ','.join(map(unicode, code_1))
    add_OSM_N = ','.join(map(unicode, add_OSM))
    zip_OSM_N = ','.join(map(unicode, zip_OSM))
    code_OSM_N = ','.join(map(unicode, code_OSM))
    city_1_N = ','.join(map(unicode, city_1))
    city_OSM_N = ','.join(map(unicode, city_OSM))
    country_1_N = ','.join(map(unicode, country_1))
    country_OSM_N = ','.join(map(unicode, country_OSM))
    state_1_N = ','.join(map(unicode, state_1))
    county_1_N = ','.join(map(unicode, county_1))
    # state_OSM_N = ','.join(map(unicode, state_OSM))
    # county_OSM_N = ','.join(map(unicode, county_OSM))

    return jsonify(Email=email_N,
                   Www=url_N,
                   Name=name1_N,
                   Organization=orgg_N,
                   FullAddress=wotex_N,
                   Role=pos_N,
                   Tel1=tel_N,
                   Mob1=mob_N,
                   Fax1=fax_N,
                   Phone=tel2_N,
                   Fax=fax2_N,
                   Mobile=mob2_N,
                   Address1Google=add_1_N,
                   ZipCodeGoogle=zip_1_N,
                   CountryCodeGoogle=code_1_N,
                   Address1Osm=add_OSM_N,
                   ZipCodeOsm=zip_OSM_N,
                   CountryCodeOsm=code_OSM_N,
                   CityGoogle=city_1_N,
                   CityOsm=city_OSM_N,
                   CountryGoogle=country_1_N,
                   CountryOsm=country_OSM_N,
                   StateGoogle=state_1_N,
                   CountyGoogle=county_1_N,
                   Language=language)
Example #30
entities_list = []
from chunk import chunker
for sent in nltk.sent_tokenize(text):
    #print pos_tag(tokenize(sent))
    for chunk in chunker.parse(pos_tag(tokenize(sent))):
        #        print chunk
        entities_list.append(chunk)

with open('out.txt', 'w') as file:
    file.write(str(entities_list))
#print entities_list
import unicodecsv as csv
from commonregex import CommonRegex
from nltk.tag.stanford import StanfordNERTagger

parser = CommonRegex()
standford_ner = StanfordNERTagger(
    'classifiers/english.conll.4class.distsim.crf.ser.gz', 'stanford-ner.jar')
people = []
organizations = []
locations = []
emails = []
phone_numbers = []
street_addresses = []
credit_cards = []
ips = []
data = []

with open('sample-data.csv', 'r') as filedata:
    reader = csv.reader(filedata)
    for row in reader:
Example #31
 def __init__(self, filepath):
     self.filepath = filepath
     self.parser = CommonRegex()
     self.standford_ner = StanfordNERTagger('classifiers/english.conll.4class.distsim.crf.ser.gz')
Example #32
def is_other(df):
    other_dic = {}
    parser = CommonRegex()
    find_name = NameDataset()

    # Run every category check and record its (label, count) result.
    checks = [
        is_person_name, is_business_name, is_phone_number, is_address,
        is_street_name, is_city_agency, is_city, is_neighborhood,
        is_lat_lon_cord, is_zip_code, is_borough, is_school_name,
        is_color, is_car_make, is_area_of_study, is_subject_in_school,
        is_school_level, is_college_name, is_website,
        is_building_classification, is_vehicle_type, is_location_type,
        is_park_playground, is_letter,
    ]
    for check in checks:
        other_label, other_count = check(df)
        other_dic[other_label] = other_count

    other_label_list = []
    other_label_count = []
    k = Counter(other_dic)
    top_three = k.most_common(3)
    for i in top_three:
        other_label_list.append(i[0])
        other_label_count.append(i[1])

    return other_label_list, other_label_count
Example #33
            for i, d in enumerate(dictionary):
                if d[0] == word:
                    wordID = i
                    features_matrix[0, wordID] = words.count(word)

    return features_matrix


test_doc = 'travel-nontravel/tr3.txt'
doc_matrix = extract_features_for_single_doc(test_doc)

result3 = model1.predict(doc_matrix)
if result3 == 0:
    print("non travel")
else:
    print("travel")
print(str(result3) + "\n")
if result3 == 1:
    f = open(test_doc, "r")
    if f.mode == 'r':
        contents = f.read()
        print(contents)
        #matches = datefinder.find_dates(contents)
        #for match in matches:
        #print match
        parsed_text = CommonRegex(contents)
        print(parsed_text.times)
        print(parsed_text.dates)
        print(parsed_text.street_addresses)
        print(parsed_text.btc_addresses)
Example #34
 def setUp(self):
     self.parser = CommonRegex()
Example #35
import requests
import json
from commonregex import CommonRegex
from .exceptions import *
from .helper import make_request
from sentry_sdk import capture_message, capture_exception
from urllib.parse import urlencode
import html

parser = CommonRegex()


class SongParser(object):
    base_url = "https://songwhip.com/api/"
    headers = {
        'Origin': 'https://songwhip.com',
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36 OPR/67.0.3575.53',
        'Content-Type': 'application/json'
    }
    # payload = "{\"url\":\"{}\",\"country\":\"IN\"}"
    payload = {"country": "IN", "url": ""}
    SONG_SERVICES_MAP = {
        "tidal": "Tidal",
        "deezer": "Deezer",
        "itunes": "iTunes",
        "pandora": "Pandora",
        "spotify": "Spotify",
        "youtube": "YouTube",
        "googleplay": "Google Play",
        "itunesStore": "iTunes Store",
Example #36
    def __mark_merchant_line(text_lines, line_rect_list):
        """
            Check and mark the Merchant lines
        """
        merchant_rect = None
        name_line_list = []
        page_rect = line_rect_list['rect'][0]
        list_no_name = ['welcome', 'thank you', 'customer', 'copy', 'only', '*', 'ticket',
                        '(', ')', ':', 'invoice', '!', 'more', 'congratulation', 'bill']

        for i in range(len(text_lines)):
            # pre-processing of text line
            for j in range(i + 1, len(line_rect_list['text'])):
                if text_lines[i] == line_rect_list['text'][j]:
                    break

            line_rect = line_rect_list['rect'][j]

            text_lines[i] = text_lines[i].replace('Welcome to', '')
            text_lines[i] = text_lines[i].strip('-')

            # check contains of key in list_no_name
            f_check_no_list = False
            for j in range(len(list_no_name)):
                if list_no_name[j] in text_lines[i].lower():
                    f_check_no_list = True
                    break

            if f_check_no_list:
                continue

            # check validation of key
            if len(text_lines[i]) <= 2:
                continue
            elif len(name_line_list) > 0 and name_line_list[-1] + 1 != i:
                break
            elif len(name_line_list) > 0 and text_lines[name_line_list[-1]] in text_lines[i]:
                continue
            elif len(name_line_list) > 2:
                continue
            elif len(name_line_list) > 1 and not text_lines[i].isupper():
                continue
            elif text_lines[i][0] == '#':
                continue
            elif len(CommonRegex(text_lines[i]).dates) > 0:
                continue
            elif len(CommonRegex(text_lines[i]).phones) > 0:
                continue
            elif len(CommonRegex(text_lines[i]).links) > 0:
                continue
            elif len(text_lines[i].replace('@', '').replace('&', '').split()) > 5:
                continue
            elif len(text_lines[i].split()) > 3 and '-' in text_lines[i]:
                continue
            elif text_lines[i].replace('-', '').replace(' ', '').isdigit():  # '305337 - 1'
                continue
            elif len(name_line_list) > 0 and line_rect[1] > 2 * merchant_rect[3] - merchant_rect[1]:
                continue
            elif (line_rect[0] + line_rect[2]) > (page_rect[0] + page_rect[2]) * 1.3:   # check the position
                continue

            name_line_list.append(i)
            merchant_rect = line_rect

        return name_line_list
Example #37
def sanitize(text, sanitized_filename):

    # Remove web URLs
    sanitized = re.sub(r"http\S+", "", text)

    # Remove HTML tags
    sanitized = re.sub('<[^<]+?>', '', sanitized)

    # Remove HTML characters
    sanitized = re.sub('&[^ ]+;', '', sanitized)

    # Remove state abbreviations
    sanitized = re.sub(
        r'(?<!\w)(?:,\s+)?(?:A[LKZR]|C[AOT]|DE|FL|GA|HI|I[ADLN]|K[SY]|LA|M[EDAINSOT]|N[EVHJMYCD]|O[HKR]|PA|RI|S[CD]|T[NX]|UT|V[AT]|W[AIVY]),?\s?(?!\w)',
        '', sanitized)

    fo = open('unique_senders.txt')
    try:
        text_ = fo.read()
    finally:
        fo.close()

    # Load natural language processor and add custom organizations
    nlp = spacy.load("en_core_web_lg")
    ruler = nlp.add_pipe("entity_ruler")
    unique_senders = set(text_.split('\n'))
    patterns = list()
    for unique_sender in unique_senders:
        pattern = {"label": "ORG", "pattern": [{"LOWER": unique_sender}]}
        patterns.append(pattern)
    ruler.add_patterns(patterns)

    lemmatizer = WordNetLemmatizer()
    sentences = nltk.sent_tokenize(sanitized)
    counter = 0
    num_sentences = len(sentences)
    res = ''
    print('There are %d sentences' % (num_sentences))

    # Write the result
    try:
        sanitized_file = open(sanitized_filename, "w")
        for sentence in sentences:
            counter += 1
            if counter % 1000 == 0 or counter == num_sentences:
                print('%d sentences sanitized (%.2f%% complete)' %
                      (counter, float(counter / num_sentences) * 100))

            if is_stop_sentence(sentence):
                continue

            #print('1 ' + sentence)
            doc = nlp(sentence)
            for ent in doc.ents:
                #print('ent: %s ent.label: %s' % (ent.text, ent.label_))
                if ent.label_ == 'ORG' or ent.label_ == 'NORP':
                    sentence = sentence.replace(ent.text, 'BUSINESSNAME')
                elif ent.label_ == 'PRODUCT':
                    sentence = sentence.replace(ent.text, 'PRODUCTNAME')
                elif ent.label_ == 'PERSON' or ent.label_ == 'GPE':
                    sentence = sentence.replace(ent.text, '')

            #print('2 ' + sentence)
            # Remove prices
            parsed_text = CommonRegex(sentence)
            for price in parsed_text.prices:
                sentence = sentence.replace(price, '')

            #print('3 ' + sentence)
            # Remove non-dictionary words
            words = set(nltk.corpus.words.words())
            puncts = {'.', '"', "'", ',', '-', '%', '!', '?'}
            etcs = {'email', 'online'}
            words = words.union(puncts).union(etcs)
            tokens = nltk.wordpunct_tokenize(sentence)
            pos_tags = nltk.pos_tag(tokens)
            #print(pos_tags)

            for token_idx, token in enumerate(tokens):
                syntactic_category = get_syntactic_category(
                    pos_tags[token_idx][1])
                if syntactic_category != None:
                    lemmatized_token = lemmatizer.lemmatize(
                        token.lower(), syntactic_category)
                    #print('lemmatized: %s original: %s' % (lemmatized_token, token))
                    if lemmatized_token.lower(
                    ) not in words and token != 'BUSINESSNAME' and token != 'PRODUCTNAME' and not token.isnumeric(
                    ):
                        sentence = sentence.replace(token, '')
                else:
                    if token.lower(
                    ) not in words and token != 'BUSINESSNAME' and token != 'PRODUCTNAME' and not token.isnumeric(
                    ):
                        sentence = sentence.replace(token, '')

            #print('4 ' + sentence)

            # Remove extra whitespaces
            sentence = re.sub(r'\s+', ' ', sentence)
            sentence = re.sub(r' , ', ' ', sentence)
            sentence = re.sub(r' \.', '.', sentence)
            sentence = re.sub(r' !', '!', sentence)
            sentence = re.sub(r' \?', '?', sentence)
            #print('5 ' + sentence)

            sanitized_file.write(sentence + '\n')

    finally:
        sanitized_file.close()