예제 #1
0
 def __init__(self):
     """Bind the shared 'horus' logger and load the project configuration.

     The logger is configured only when it has no handlers yet: its level
     is set to DEBUG and two handlers sharing one format are attached, a
     per-day log file under ``self.conf.dir_log`` and a console stream.
     """
     self.logger = logging.getLogger('horus')
     self.conf = HorusConfig()
     if self.logger.handlers:
         # Handlers are already attached to this named logger; adding
         # them again would emit every record more than once.
         return

     self.logger.setLevel(logging.DEBUG)

     # One log file per calendar day, e.g. horus_2020-01-31.log.
     day_stamp = datetime.datetime.now().strftime("%Y-%m-%d")
     file_handler = logging.FileHandler(
         self.conf.dir_log + 'horus_' + day_stamp + '.log')
     record_format = logging.Formatter(
         "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s"
     )
     console_handler = logging.StreamHandler()

     # File handler first, console second, both with the same layout.
     for sink in (file_handler, console_handler):
         sink.setFormatter(record_format)
         self.logger.addHandler(sink)
예제 #2
0
def main():
    """Command-line entry point: annotate a text and print the features.

    The text comes from ``--text``; when only ``--file`` is supplied, the
    file's contents are read and used as the text instead. If both are
    given, ``--text`` takes precedence.

    Raises:
        SystemExit: raised by ``OptionParser.error`` (exit status 2) when
            neither ``--text`` nor ``--file`` is provided.
    """
    op = OptionParser(usage='usage: %prog [options] arguments (example: main.py --text="paris hilton was once the toast of the town.")')

    op.add_option("--text", dest="text", help="The text to be annotated")
    op.add_option("--file", dest="file", help="The file to be annotated")
    op.add_option("--ds_format", dest="ds_format", default=0, help="The format to be annotated [0 = input text (default), 1 = Ritter, 2 = CoNNL]")
    op.add_option("--output_file", dest="output_file", default="horus_out", help="The output file")
    op.add_option("--output_format", dest="output_format", default="json", help="The output file type")

    opts, _ = op.parse_args()
    print(__doc__)
    op.print_help()

    if not opts.text and not opts.file:
        op.error('inform either an [text] or [file] as parameter!')

    text = opts.text
    if not text:
        # Only --file was supplied. Previously None was handed to the
        # extractor in this case; read the file and annotate its contents.
        # NOTE(review): this treats the file as plain input text
        # (ds_format 0); the other formats are not handled here.
        with open(opts.file) as source:
            text = source.read()

    config = HorusConfig()
    extractor = FeatureExtraction(config, load_sift=1, load_tfidf=1, load_cnn=0, load_topic_modeling=1)
    print(extractor.extract_features_text(text))
예제 #3
0
        try:
            text = self.clean_text(text)
            if len(str(text)) == 0:
                return ''
            if isinstance(text, unicode) == True:
                text = text.encode('ascii', 'ignore')

            params = {'text': text, 'to': to_lang}
            h = self.get_header()
            print(h)
            translationData = requests.get(self.translateUrl,
                                           params=params,
                                           headers=h)  #urllib.urlencode()
            if translationData.status_code != 200:
                raise Exception(':: error: bing translation status code: ' +
                                str(translationData.status_code) + ' - ' +
                                str(translationData.text))
            translation = ElementTree.fromstring(
                translationData.text.encode('utf-8'))
            return translation.text
        except:
            raise


if __name__ == "__main__":
    # Manual smoke run: build a translator from the project configuration
    # and print the result of a few translate / detect_language calls.
    config = HorusConfig()
    t = BingTranslator(config)

    # Single-argument call form prints the same output as the print
    # statement on Python 2.
    print(t.translate("hey what's up dude?", 'pt-br'))
    print(t.detect_language("Que lingua estou falando, amigo?"))
    print(t.detect_language("Green Newsfeed"))
    print(t.translate("Green Newsfeed", 'pt'))
예제 #4
0
 def __init__(self):
     """Load the Stanford NER and POS taggers and the word2vec embeddings.

     Every model location is read from one ``HorusConfig`` instance,
     which is kept on ``self.config``.
     """
     # The configuration used to be bound only to a local name while the
     # tagger paths were read from ``self.config``; store it on the
     # instance so both lookups resolve to the same object.
     self.config = HorusConfig()
     self.stanford_ner = StanfordNERTagger(
         self.config.model_stanford_filename_ner,
         self.config.model_stanford_path_jar_ner)
     self.stanford_pos = StanfordPOSTagger(
         self.config.model_stanford_filename_pos,
         self.config.model_stanford_path_jar_pos)
     # Java option string for the POS tagger's JVM
     # (presumably an 8 GB maximum heap -- confirm against the NLTK docs).
     self.stanford_pos.java_options = '-mx8g'
     # binary=True: the embeddings file is loaded in word2vec binary format.
     self.word2vec_google = gensim.models.KeyedVectors.load_word2vec_format(
         self.config.embeddings_path, binary=True)
예제 #5
0
import spacy
import en_core_web_sm
import os

from src.config import HorusConfig

# Set before `shorttext` is imported further down -- presumably so that a
# TensorFlow backend pulled in by it starts with reduced C++ log output.
# TODO confirm that the import order is intentional.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# Load the `en_core_web_sm` spaCy model once, at import time.
nlp = en_core_web_sm.load()
#spacy.load('en')

# Module-level project configuration instance.
conf = HorusConfig()
# NOTE(review): imported here rather than at the top of the file; keep it
# after the environment variable assignment above unless verified otherwise.
import shorttext
# Earlier hard-coded embedding file locations, kept commented out:
#emb = '/Volumes/dne5data/embeddings/GoogleNews-vectors-negative300.bin.gz'
#emb = '/Users/diegoesteves/Downloads/GoogleNews-vectors-negative300 (1).bin'

dict = {
    'per': [
        'arnett', 'david', 'richard', 'james', 'frank', 'george', 'misha',
        'students', 'education', 'coach', 'football', 'turkish', 'albanian',
        'romanian', 'professor', 'lawyer', 'president', 'king', 'man', 'woman',
        'danish', 'we', 'he', 'their', 'born', 'directed', 'died', 'lives',
        'boss', 'syrian', 'elected', 'minister', 'candidate', 'daniel',
        'robert', 'dude', 'guy', 'girl', 'woman', 'husband', 'actor', 'people',
        'celebrity'
    ],
    'loc': [
        'china', 'usa', 'germany', 'leipzig', 'alaska', 'poland', 'jakarta',
        'kitchen', 'house', 'brazil', 'fuji', 'prison', 'portugal', 'lisbon',
        'france', 'oslo', 'airport', 'road', 'highway', 'forest', 'sea',
        'lake', 'stadium', 'hospital', 'temple', 'beach', 'hotel', 'country',